2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 /***********************************************************************
45 ***********************************************************************/
54 tre_ctype_t *neg_classes;
60 /***********************************************************************
61 from tre-ast.c and tre-ast.h
62 ***********************************************************************/
64 /* The different AST node types. */
72 /* Special subtypes of TRE_LITERAL. */
73 #define EMPTY -1 /* Empty leaf (denotes empty string). */
74 #define ASSERTION -2 /* Assertion leaf. */
75 #define TAG -3 /* Tag leaf. */
76 #define BACKREF -4 /* Back reference leaf. */
78 #define IS_SPECIAL(x) ((x)->code_min < 0)
79 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
80 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
81 #define IS_TAG(x) ((x)->code_min == TAG)
82 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
85 /* A generic AST node. All AST nodes consist of this node on the top
86 level with `obj' pointing to the actual content. */
88 tre_ast_type_t type; /* Type of the node. */
89 void *obj; /* Pointer to actual node. */
94 tre_pos_and_tags_t *firstpos;
95 tre_pos_and_tags_t *lastpos;
99 /* A "literal" node. These are created for assertions, back references,
100 tags, matching parameter settings, and all expressions that match one
110 tre_ctype_t *neg_classes;
113 /* A "catenation" node. These are created when two regexps are concatenated.
114 If there are more than one subexpressions in sequence, the `left' part
115 holds all but the last, and `right' part holds the last subexpression
116 (catenation is left associative). */
118 tre_ast_node_t *left;
119 tre_ast_node_t *right;
122 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
125 /* Subexpression to match. */
127 /* Minimum number of consecutive matches. */
129 /* Maximum number of consecutive matches. */
131 /* If 0, match as many characters as possible, if 1 match as few as
132 possible. Note that this does not always mean the same thing as
133 matching as many/few repetitions as possible. */
134 unsigned int minimal:1;
137 /* An "union" node. These are created for the "|" operator. */
139 tre_ast_node_t *left;
140 tre_ast_node_t *right;
143 static tre_ast_node_t *
144 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
146 static tre_ast_node_t *
147 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
149 static tre_ast_node_t *
150 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
153 static tre_ast_node_t *
154 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
156 static tre_ast_node_t *
157 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
158 tre_ast_node_t *right);
161 static tre_ast_node_t *
162 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
164 tre_ast_node_t *node;
166 node = tre_mem_calloc(mem, sizeof(*node));
169 node->obj = tre_mem_calloc(mem, size);
174 node->submatch_id = -1;
179 static tre_ast_node_t *
180 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
182 tre_ast_node_t *node;
185 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
189 lit->code_min = code_min;
190 lit->code_max = code_max;
191 lit->position = position;
196 static tre_ast_node_t *
197 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
200 tre_ast_node_t *node;
201 tre_iteration_t *iter;
203 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
210 iter->minimal = minimal;
211 node->num_submatches = arg->num_submatches;
216 static tre_ast_node_t *
217 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
219 tre_ast_node_t *node;
221 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
224 ((tre_union_t *)node->obj)->left = left;
225 ((tre_union_t *)node->obj)->right = right;
226 node->num_submatches = left->num_submatches + right->num_submatches;
231 static tre_ast_node_t *
232 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
233 tre_ast_node_t *right)
235 tre_ast_node_t *node;
237 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
240 ((tre_catenation_t *)node->obj)->left = left;
241 ((tre_catenation_t *)node->obj)->right = right;
242 node->num_submatches = left->num_submatches + right->num_submatches;
248 /***********************************************************************
249 from tre-stack.c and tre-stack.h
250 ***********************************************************************/
252 typedef struct tre_stack_rec tre_stack_t;
254 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
255 is maximum size, and `increment' specifies how much more space will be
256 allocated with realloc() if all space gets used up. Returns the stack
257 object or NULL if out of memory. */
259 tre_stack_new(int size, int max_size, int increment);
261 /* Frees the stack object. */
263 tre_stack_destroy(tre_stack_t *s);
265 /* Returns the current number of objects in the stack. */
267 tre_stack_num_objects(tre_stack_t *s);
269 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
270 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
271 This tries to realloc() more space before failing if maximum size
272 has not yet been reached. Returns REG_OK if successful. */
273 #define declare_pushf(typetag, type) \
274 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
276 declare_pushf(voidptr, void *);
277 declare_pushf(int, int);
279 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
280 element off of stack `s' and returns it. The stack must not be
282 #define declare_popf(typetag, type) \
283 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
285 declare_popf(voidptr, void *);
286 declare_popf(int, int);
288 /* Just to save some typing. */
289 #define STACK_PUSH(s, typetag, value) \
292 status = tre_stack_push_ ## typetag(s, value); \
294 while (/*CONSTCOND*/0)
296 #define STACK_PUSHX(s, typetag, value) \
298 status = tre_stack_push_ ## typetag(s, value); \
299 if (status != REG_OK) \
303 #define STACK_PUSHR(s, typetag, value) \
305 reg_errcode_t _status; \
306 _status = tre_stack_push_ ## typetag(s, value); \
307 if (_status != REG_OK) \
311 union tre_stack_item {
316 struct tre_stack_rec {
321 union tre_stack_item *stack;
326 tre_stack_new(int size, int max_size, int increment)
330 s = xmalloc(sizeof(*s));
333 s->stack = xmalloc(sizeof(*s->stack) * size);
334 if (s->stack == NULL)
340 s->max_size = max_size;
341 s->increment = increment;
348 tre_stack_destroy(tre_stack_t *s)
355 tre_stack_num_objects(tre_stack_t *s)
361 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
363 if (s->ptr < s->size)
365 s->stack[s->ptr] = value;
370 if (s->size >= s->max_size)
376 union tre_stack_item *new_buffer;
378 new_size = s->size + s->increment;
379 if (new_size > s->max_size)
380 new_size = s->max_size;
381 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
382 if (new_buffer == NULL)
386 assert(new_size > s->size);
388 s->stack = new_buffer;
389 tre_stack_push(s, value);
395 #define define_pushf(typetag, type) \
396 declare_pushf(typetag, type) { \
397 union tre_stack_item item; \
398 item.typetag ## _value = value; \
399 return tre_stack_push(s, item); \
402 define_pushf(int, int)
403 define_pushf(voidptr, void *)
405 #define define_popf(typetag, type) \
406 declare_popf(typetag, type) { \
407 return s->stack[--s->ptr].typetag ## _value; \
410 define_popf(int, int)
411 define_popf(voidptr, void *)
414 /***********************************************************************
415 from tre-parse.c and tre-parse.h
416 ***********************************************************************/
420 /* Memory allocator. The AST is allocated using this. */
422 /* Stack used for keeping track of regexp syntax. */
424 /* The parse result. */
425 tre_ast_node_t *result;
426 /* The regexp to parse and its length. */
428 /* The first character of the entire regexp. */
429 const char *re_start;
430 /* Current submatch ID. */
432 /* Current position (number of literal). */
434 /* The highest back reference or -1 if none seen so far. */
436 /* This flag is set if the regexp uses approximate matching. */
438 /* Compilation flags. */
440 /* If this flag is set the top-level submatch is not captured. */
444 /* Parses a wide character regexp pattern into a syntax tree. This parser
445 handles both syntaxes (BRE and ERE), including the TRE extensions. */
447 tre_parse(tre_parse_ctx_t *ctx);
451 This parser is just a simple recursive descent parser for POSIX.2
452 regexps. The parser supports both the obsolete default syntax and
453 the "extended" syntax, and some nonstandard extensions.
456 /* Characters with special meanings in regexp syntax. */
457 #define CHAR_PIPE '|'
458 #define CHAR_LPAREN '('
459 #define CHAR_RPAREN ')'
460 #define CHAR_LBRACE '{'
461 #define CHAR_RBRACE '}'
462 #define CHAR_LBRACKET '['
463 #define CHAR_RBRACKET ']'
464 #define CHAR_MINUS '-'
465 #define CHAR_STAR '*'
466 #define CHAR_QUESTIONMARK '?'
467 #define CHAR_PLUS '+'
468 #define CHAR_PERIOD '.'
469 #define CHAR_COLON ':'
470 #define CHAR_EQUAL '='
471 #define CHAR_COMMA ','
472 #define CHAR_CARET '^'
473 #define CHAR_DOLLAR '$'
474 #define CHAR_BACKSLASH '\\'
475 #define CHAR_HASH '#'
476 #define CHAR_TILDE '~'
479 /* Some macros for expanding \w, \s, etc. */
480 static const struct tre_macro_struct {
482 const char *expansion;
484 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
485 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
486 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
487 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
492 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
493 must have at least `len' items. Sets buf[0] to zero if the there
494 is no match in `tre_macros'. */
496 tre_expand_macro(const char *regex)
503 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
504 return tre_macros[i].expansion;
508 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
509 tre_ast_node_t ***items)
511 reg_errcode_t status;
512 tre_ast_node_t **array = *items;
513 /* Allocate more space if necessary. */
516 tre_ast_node_t **new_items;
517 /* If the array is already 1024 items large, give up -- there's
518 probably an error in the regexp (e.g. not a '\0' terminated
519 string and missing ']') */
523 new_items = xrealloc(array, sizeof(*items) * *max_i);
524 if (new_items == NULL)
526 *items = array = new_items;
528 array[*i] = tre_ast_new_literal(mem, min, max, -1);
529 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
536 tre_compare_items(const void *a, const void *b)
538 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
539 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
540 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
541 int a_min = l_a->code_min, b_min = l_b->code_min;
545 else if (a_min > b_min)
551 /* Maximum number of character classes that can occur in a negated bracket
553 #define MAX_NEG_CLASSES 64
555 /* Maximum length of character class names. */
556 #define MAX_CLASS_NAME
559 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
560 tre_ctype_t neg_classes[], int *num_neg_classes,
561 tre_ast_node_t ***items, int *num_items,
564 const char *re = ctx->re;
565 reg_errcode_t status = REG_OK;
566 tre_ctype_t class = (tre_ctype_t)0;
568 int max_i = *items_size;
571 /* Build an array of the items in the bracket expression. */
572 while (status == REG_OK)
579 else if (*re == CHAR_RBRACKET && re > ctx->re)
586 tre_cint_t min = 0, max = 0;
588 int clen = mbtowc(&wc, re, -1);
590 if (clen<0) clen=1, wc=WEOF;
592 class = (tre_ctype_t)0;
593 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
597 clen = mbtowc(&wc, re, -1);
598 if (clen<0) clen=1, wc=WEOF;
601 /* XXX - Should use collation order instead of encoding values
602 in character ranges. */
606 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
607 status = REG_ECOLLATE;
608 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
609 status = REG_ECOLLATE;
610 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
613 const char *endptr = re + 2;
615 while (*endptr && *endptr != CHAR_COLON)
619 len = MIN(endptr - re - 2, 63);
620 strncpy(tmp_str, re + 2, len);
622 class = tre_ctype(tmp_str);
634 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
636 /* Two ranges are not allowed to share and endpoint. */
642 if (status != REG_OK)
646 if (*num_neg_classes >= MAX_NEG_CLASSES)
649 neg_classes[(*num_neg_classes)++] = class;
652 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
653 if (status != REG_OK)
655 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
658 /* Add opposite-case counterpoints if REG_ICASE is present.
659 This is broken if there are more than two "same" characters. */
660 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
662 tre_cint_t cmin, ccurr;
666 if (tre_islower(min))
668 cmin = ccurr = tre_toupper(min++);
669 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
671 ccurr = tre_toupper(min++);
672 status = tre_new_item(ctx->mem, cmin, ccurr,
675 else if (tre_isupper(min))
677 cmin = ccurr = tre_tolower(min++);
678 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
680 ccurr = tre_tolower(min++);
681 status = tre_new_item(ctx->mem, cmin, ccurr,
685 if (status != REG_OK)
688 if (status != REG_OK)
700 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
702 tre_ast_node_t *node = NULL;
704 reg_errcode_t status = REG_OK;
705 tre_ast_node_t **items, *u, *n;
706 int i = 0, j, max_i = 32, curr_max, curr_min;
707 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
708 int num_neg_classes = 0;
710 /* Start off with an array of `max_i' elements. */
711 items = xmalloc(sizeof(*items) * max_i);
715 if (*ctx->re == CHAR_CARET)
721 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
724 if (status != REG_OK)
725 goto parse_bracket_done;
727 /* Sort the array if we need to negate it. */
729 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
731 curr_max = curr_min = 0;
732 /* Build a union of the items in the array, negated if necessary. */
733 for (j = 0; j < i && status == REG_OK; j++)
736 tre_literal_t *l = items[j]->obj;
745 curr_max = MAX(max + 1, curr_max);
752 if (curr_max >= curr_min)
754 l->code_min = curr_min;
755 l->code_max = curr_max;
761 curr_min = curr_max = max + 1;
768 l->position = ctx->position;
769 if (num_neg_classes > 0)
771 l->neg_classes = tre_mem_alloc(ctx->mem,
772 (sizeof(l->neg_classes)
773 * (num_neg_classes + 1)));
774 if (l->neg_classes == NULL)
779 for (k = 0; k < num_neg_classes; k++)
780 l->neg_classes[k] = neg_classes[k];
781 l->neg_classes[k] = (tre_ctype_t)0;
784 l->neg_classes = NULL;
789 u = tre_ast_new_union(ctx->mem, node, items[j]);
797 if (status != REG_OK)
798 goto parse_bracket_done;
803 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
808 tre_literal_t *l = n->obj;
809 if (num_neg_classes > 0)
811 l->neg_classes = tre_mem_alloc(ctx->mem,
812 (sizeof(l->neg_classes)
813 * (num_neg_classes + 1)));
814 if (l->neg_classes == NULL)
817 goto parse_bracket_done;
819 for (k = 0; k < num_neg_classes; k++)
820 l->neg_classes[k] = neg_classes[k];
821 l->neg_classes[k] = (tre_ctype_t)0;
824 l->neg_classes = NULL;
829 u = tre_ast_new_union(ctx->mem, node, n);
837 if (status != REG_OK)
838 goto parse_bracket_done;
842 #endif /* TRE_DEBUG */
852 /* Parses a positive decimal integer. Returns -1 if the string does not
853 contain a valid number. */
855 tre_parse_int(const char **regex)
858 const char *r = *regex;
863 num = num * 10 + *r - '0';
872 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
875 const char *r = ctx->re;
878 /* Parse number (minimum repetition count). */
880 if (*r >= '0' && *r <= '9') {
881 min = tre_parse_int(&r);
884 /* Parse comma and second number (maximum repetition count). */
886 if (*r == CHAR_COMMA)
889 max = tre_parse_int(&r);
892 /* Check that the repeat counts are sane. */
893 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
900 /* Empty contents of {}. */
904 /* Parse the ending '}' or '\}'.*/
905 if (ctx->cflags & REG_EXTENDED)
907 if (*r != CHAR_RBRACE)
913 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
918 /* Create the AST node(s). */
919 if (min == 0 && max == 0)
921 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
927 if (min < 0 && max < 0)
928 /* Only approximate parameters set, no repetitions. */
931 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
943 PARSE_MARK_FOR_SUBMATCH,
947 PARSE_POST_CATENATION,
952 } tre_parse_re_stack_symbol_t;
956 tre_parse(tre_parse_ctx_t *ctx)
958 tre_ast_node_t *result = NULL;
959 tre_parse_re_stack_symbol_t symbol;
960 reg_errcode_t status = REG_OK;
961 tre_stack_t *stack = ctx->stack;
962 int bottom = tre_stack_num_objects(stack);
965 if (!ctx->nofirstsub)
967 STACK_PUSH(stack, int, ctx->submatch_id);
968 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
971 STACK_PUSH(stack, int, PARSE_RE);
972 ctx->re_start = ctx->re;
975 /* The following is basically just a recursive descent parser. I use
976 an explicit stack instead of recursive functions mostly because of
977 two reasons: compatibility with systems which have an overflowable
978 call stack, and efficiency (both in lines of code and speed). */
979 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
981 if (status != REG_OK)
983 symbol = tre_stack_pop_int(stack);
987 /* Parse a full regexp. A regexp is one or more branches,
988 separated by the union operator `|'. */
989 if (ctx->cflags & REG_EXTENDED)
990 STACK_PUSHX(stack, int, PARSE_UNION);
991 STACK_PUSHX(stack, int, PARSE_BRANCH);
995 /* Parse a branch. A branch is one or more pieces, concatenated.
996 A piece is an atom possibly followed by a postfix operator. */
997 STACK_PUSHX(stack, int, PARSE_CATENATION);
998 STACK_PUSHX(stack, int, PARSE_PIECE);
1002 /* Parse a piece. A piece is an atom possibly followed by one
1003 or more postfix operators. */
1004 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1005 STACK_PUSHX(stack, int, PARSE_ATOM);
1008 case PARSE_CATENATION:
1009 /* If the expression has not ended, parse another piece. */
1015 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1017 if ((ctx->cflags & REG_EXTENDED
1018 && c == CHAR_RPAREN && depth > 0)
1019 || (!(ctx->cflags & REG_EXTENDED)
1020 && (c == CHAR_BACKSLASH
1021 && *(ctx->re + 1) == CHAR_RPAREN)))
1023 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1024 status = REG_EPAREN;
1026 if (!(ctx->cflags & REG_EXTENDED))
1032 /* Default case, left associative concatenation. */
1033 STACK_PUSHX(stack, int, PARSE_CATENATION);
1034 STACK_PUSHX(stack, voidptr, result);
1035 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1036 STACK_PUSHX(stack, int, PARSE_PIECE);
1041 case PARSE_POST_CATENATION:
1043 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1044 tre_ast_node_t *tmp_node;
1045 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1058 STACK_PUSHX(stack, int, PARSE_UNION);
1059 STACK_PUSHX(stack, voidptr, result);
1060 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1061 STACK_PUSHX(stack, int, PARSE_BRANCH);
1074 case PARSE_POST_UNION:
1076 tre_ast_node_t *tmp_node;
1077 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1078 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1086 /* Parse postfix operators. */
1092 case CHAR_QUESTIONMARK:
1093 if (!(ctx->cflags & REG_EXTENDED))
1098 tre_ast_node_t *tmp_node;
1103 if (*ctx->re == CHAR_PLUS)
1105 if (*ctx->re == CHAR_QUESTIONMARK)
1109 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1114 else if (*(ctx->re + 1) == CHAR_STAR
1115 || *(ctx->re + 1) == CHAR_PLUS)
1117 /* These are reserved for future extensions. */
1123 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1125 if (tmp_node == NULL)
1128 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1132 case CHAR_BACKSLASH:
1133 /* "\{" is special without REG_EXTENDED */
1134 if (!(ctx->cflags & REG_EXTENDED)
1135 && *(ctx->re + 1) == CHAR_LBRACE)
1144 /* "{" is literal without REG_EXTENDED */
1145 if (!(ctx->cflags & REG_EXTENDED))
1151 status = tre_parse_bound(ctx, &result);
1152 if (status != REG_OK)
1154 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1160 /* Parse an atom. An atom is a regular expression enclosed in `()',
1161 an empty set of `()', a bracket expression, `.', `^', `$',
1162 a `\' followed by a character, or a single character. */
1164 /* End of regexp? (empty string). */
1170 case CHAR_LPAREN: /* parenthesized subexpression */
1172 if (ctx->cflags & REG_EXTENDED
1173 || (ctx->re > ctx->re_start
1174 && *(ctx->re - 1) == CHAR_BACKSLASH))
1179 /* First parse a whole RE, then mark the resulting tree
1181 STACK_PUSHX(stack, int, ctx->submatch_id);
1182 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1183 STACK_PUSHX(stack, int, PARSE_RE);
1191 case CHAR_RPAREN: /* end of current subexpression */
1192 if ((ctx->cflags & REG_EXTENDED && depth > 0)
1193 || (ctx->re > ctx->re_start
1194 && *(ctx->re - 1) == CHAR_BACKSLASH))
1196 /* We were expecting an atom, but instead the current
1197 subexpression was closed. POSIX leaves the meaning of
1198 this to be implementation-defined. We interpret this as
1199 an empty expression (which matches an empty string). */
1200 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1203 if (!(ctx->cflags & REG_EXTENDED))
1210 case CHAR_LBRACKET: /* bracket expression */
1212 status = tre_parse_bracket(ctx, &result);
1213 if (status != REG_OK)
1217 case CHAR_BACKSLASH:
1218 /* If this is "\(" or "\)" chew off the backslash and
1220 if (!(ctx->cflags & REG_EXTENDED)
1221 && (*(ctx->re + 1) == CHAR_LPAREN
1222 || *(ctx->re + 1) == CHAR_RPAREN))
1225 STACK_PUSHX(stack, int, PARSE_ATOM);
1229 /* If a macro is used, parse the expanded macro recursively. */
1231 const char *buf = tre_expand_macro(ctx->re + 1);
1234 tre_parse_ctx_t subctx;
1235 memcpy(&subctx, ctx, sizeof(subctx));
1237 subctx.nofirstsub = 1;
1238 status = tre_parse(&subctx);
1239 if (status != REG_OK)
1242 ctx->position = subctx.position;
1243 result = subctx.result;
1249 /* Trailing backslash. */
1256 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1261 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1262 ASSERT_AT_WB_NEG, -1);
1266 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1271 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1277 if (ctx->re[0] != CHAR_LBRACE)
1279 /* 8 bit hex char. */
1280 char tmp[3] = {0, 0, 0};
1283 if (tre_isxdigit(ctx->re[0]))
1285 tmp[0] = (char)ctx->re[0];
1288 if (tre_isxdigit(ctx->re[0]))
1290 tmp[1] = (char)ctx->re[0];
1293 val = strtol(tmp, NULL, 16);
1294 result = tre_ast_new_literal(ctx->mem, (int)val,
1295 (int)val, ctx->position);
1306 while (*ctx->re && i < sizeof tmp)
1308 if (ctx->re[0] == CHAR_RBRACE)
1310 if (tre_isxdigit(ctx->re[0]))
1312 tmp[i] = (char)ctx->re[0];
1321 val = strtol(tmp, NULL, 16);
1322 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1330 if (tre_isdigit(*ctx->re))
1332 /* Back reference. */
1333 int val = *ctx->re - '0';
1334 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1339 ctx->max_backref = MAX(val, ctx->max_backref);
1344 /* Escaped character. */
1345 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1356 case CHAR_PERIOD: /* the any-symbol */
1357 if (ctx->cflags & REG_NEWLINE)
1359 tre_ast_node_t *tmp1;
1360 tre_ast_node_t *tmp2;
1361 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1365 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1369 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1376 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1385 case CHAR_CARET: /* beginning of line assertion */
1386 /* '^' has a special meaning everywhere in EREs, and in the
1387 beginning of the RE and after \( is BREs. */
1388 if (ctx->cflags & REG_EXTENDED
1389 || (ctx->re - 2 >= ctx->re_start
1390 && *(ctx->re - 2) == CHAR_BACKSLASH
1391 && *(ctx->re - 1) == CHAR_LPAREN)
1392 || ctx->re == ctx->re_start)
1394 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1404 case CHAR_DOLLAR: /* end of line assertion. */
1405 /* '$' is special everywhere in EREs, and in the end of the
1406 string and before \) is BREs. */
1407 if (ctx->cflags & REG_EXTENDED
1408 || (*(ctx->re + 1) == CHAR_BACKSLASH
1409 && *(ctx->re + 2) == CHAR_RPAREN)
1412 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1425 /* We are expecting an atom. If the subexpression (or the whole
1426 regexp ends here, we interpret it as an empty expression
1427 (which matches an empty string). */
1430 || *ctx->re == CHAR_STAR
1431 || (ctx->cflags & REG_EXTENDED
1432 && (*ctx->re == CHAR_PIPE
1433 || *ctx->re == CHAR_LBRACE
1434 || *ctx->re == CHAR_PLUS
1435 || *ctx->re == CHAR_QUESTIONMARK))
1436 /* Test for "\)" in BRE mode. */
1437 || (!(ctx->cflags & REG_EXTENDED)
1439 && *ctx->re == CHAR_BACKSLASH
1440 && *(ctx->re + 1) == CHAR_LBRACE)))
1442 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1449 int clen = mbtowc(&wc, ctx->re, -1);
1450 if (clen<0) clen=1, wc=WEOF;
1452 /* Note that we can't use an tre_isalpha() test here, since there
1453 may be characters which are alphabetic but neither upper or
1455 if (ctx->cflags & REG_ICASE
1456 && (tre_isupper(wc) || tre_islower(wc)))
1458 tre_ast_node_t *tmp1;
1459 tre_ast_node_t *tmp2;
1461 /* XXX - Can there be more than one opposite-case
1462 counterpoints for some character in some locale? Or
1463 more than two characters which all should be regarded
1464 the same character if case is ignored? If yes, there
1465 does not seem to be a portable way to detect it. I guess
1466 that at least for multi-character collating elements there
1467 could be several opposite-case counterpoints, but they
1468 cannot be supported portably anyway. */
1469 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1474 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1479 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1485 result = tre_ast_new_literal(ctx->mem, wc, wc,
1496 case PARSE_MARK_FOR_SUBMATCH:
1498 int submatch_id = tre_stack_pop_int(stack);
1500 if (result->submatch_id >= 0)
1502 tre_ast_node_t *n, *tmp_node;
1503 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1506 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1507 if (tmp_node == NULL)
1509 tmp_node->num_submatches = result->num_submatches;
1512 result->submatch_id = submatch_id;
1513 result->num_submatches++;
1517 case PARSE_RESTORE_CFLAGS:
1518 ctx->cflags = tre_stack_pop_int(stack);
1527 /* Check for missing closing parentheses. */
1531 if (status == REG_OK)
1532 ctx->result = result;
1539 /***********************************************************************
1541 ***********************************************************************/
1546 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1551 Algorithms to setup tags so that submatch addressing can be done.
1555 /* Inserts a catenation node to the root of the tree given in `node'.
1556 As the left child a new tag with number `tag_id' to `node' is added,
1557 and the right child is the old root. */
1558 static reg_errcode_t
1559 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1561 tre_catenation_t *c;
1563 c = tre_mem_alloc(mem, sizeof(*c));
1566 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1567 if (c->left == NULL)
1569 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1570 if (c->right == NULL)
1573 c->right->obj = node->obj;
1574 c->right->type = node->type;
1575 c->right->nullable = -1;
1576 c->right->submatch_id = -1;
1577 c->right->firstpos = NULL;
1578 c->right->lastpos = NULL;
1579 c->right->num_tags = 0;
1581 node->type = CATENATION;
1585 /* Inserts a catenation node to the root of the tree given in `node'.
1586 As the right child a new tag with number `tag_id' to `node' is added,
1587 and the left child is the old root. */
1588 static reg_errcode_t
1589 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1591 tre_catenation_t *c;
1593 c = tre_mem_alloc(mem, sizeof(*c));
1596 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1597 if (c->right == NULL)
1599 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1600 if (c->left == NULL)
1603 c->left->obj = node->obj;
1604 c->left->type = node->type;
1605 c->left->nullable = -1;
1606 c->left->submatch_id = -1;
1607 c->left->firstpos = NULL;
1608 c->left->lastpos = NULL;
1609 c->left->num_tags = 0;
1611 node->type = CATENATION;
1617 ADDTAGS_AFTER_ITERATION,
1618 ADDTAGS_AFTER_UNION_LEFT,
1619 ADDTAGS_AFTER_UNION_RIGHT,
1620 ADDTAGS_AFTER_CAT_LEFT,
1621 ADDTAGS_AFTER_CAT_RIGHT,
1622 ADDTAGS_SET_SUBMATCH_END
1623 } tre_addtags_symbol_t;
1632 /* Go through `regset' and set submatch data for submatches that are
1635 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1639 for (i = 0; regset[i] >= 0; i++)
1641 int id = regset[i] / 2;
1642 int start = !(regset[i] % 2);
1644 tnfa->submatch_data[id].so_tag = tag;
1646 tnfa->submatch_data[id].eo_tag = tag;
1652 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1653 subexpressions marked for submatch addressing can be traced. */
1654 static reg_errcode_t
1655 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1658 reg_errcode_t status = REG_OK;
1659 tre_addtags_symbol_t symbol;
1660 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1661 int bottom = tre_stack_num_objects(stack);
1662 /* True for first pass (counting number of needed tags) */
1663 int first_pass = (mem == NULL || tnfa == NULL);
1664 int *regset, *orig_regset;
1665 int num_tags = 0; /* Total number of tags. */
1666 int num_minimals = 0; /* Number of special minimal tags. */
1667 int tag = 0; /* The tag that is to be added next. */
1668 int next_tag = 1; /* Next tag to use after this one. */
1669 int *parents; /* Stack of submatches the current submatch is
1671 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1672 tre_tag_states_t *saved_states;
1674 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1678 tnfa->minimal_tags[0] = -1;
1681 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1685 orig_regset = regset;
1687 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1688 if (parents == NULL)
1695 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1696 if (saved_states == NULL)
1705 for (i = 0; i <= tnfa->num_submatches; i++)
1706 saved_states[i].tag = -1;
1709 STACK_PUSH(stack, voidptr, node);
1710 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1712 while (tre_stack_num_objects(stack) > bottom)
1714 if (status != REG_OK)
1717 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1721 case ADDTAGS_SET_SUBMATCH_END:
1723 int id = tre_stack_pop_int(stack);
1726 /* Add end of this submatch to regset. */
1727 for (i = 0; regset[i] >= 0; i++);
1728 regset[i] = id * 2 + 1;
1731 /* Pop this submatch from the parents stack. */
1732 for (i = 0; parents[i] >= 0; i++);
1733 parents[i - 1] = -1;
1737 case ADDTAGS_RECURSE:
1738 node = tre_stack_pop_voidptr(stack);
1740 if (node->submatch_id >= 0)
1742 int id = node->submatch_id;
1746 /* Add start of this submatch to regset. */
1747 for (i = 0; regset[i] >= 0; i++);
1753 for (i = 0; parents[i] >= 0; i++);
1754 tnfa->submatch_data[id].parents = NULL;
1757 int *p = xmalloc(sizeof(*p) * (i + 1));
1760 status = REG_ESPACE;
1763 assert(tnfa->submatch_data[id].parents == NULL);
1764 tnfa->submatch_data[id].parents = p;
1765 for (i = 0; parents[i] >= 0; i++)
1771 /* Add end of this submatch to regset after processing this
1773 STACK_PUSHX(stack, int, node->submatch_id);
1774 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1781 tre_literal_t *lit = node->obj;
1783 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1788 /* Regset is not empty, so add a tag before the
1789 literal or backref. */
1792 status = tre_add_tag_left(mem, node, tag);
1793 tnfa->tag_directions[tag] = direction;
1794 if (minimal_tag >= 0)
1796 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1797 tnfa->minimal_tags[i] = tag;
1798 tnfa->minimal_tags[i + 1] = minimal_tag;
1799 tnfa->minimal_tags[i + 2] = -1;
1803 tre_purge_regset(regset, tnfa, tag);
1818 assert(!IS_TAG(lit));
1824 tre_catenation_t *cat = node->obj;
1825 tre_ast_node_t *left = cat->left;
1826 tre_ast_node_t *right = cat->right;
1827 int reserved_tag = -1;
1830 /* After processing right child. */
1831 STACK_PUSHX(stack, voidptr, node);
1832 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1834 /* Process right child. */
1835 STACK_PUSHX(stack, voidptr, right);
1836 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1838 /* After processing left child. */
1839 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1840 if (left->num_tags > 0 && right->num_tags > 0)
1842 /* Reserve the next tag to the right child. */
1843 reserved_tag = next_tag;
1846 STACK_PUSHX(stack, int, reserved_tag);
1847 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1849 /* Process left child. */
1850 STACK_PUSHX(stack, voidptr, left);
1851 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1857 tre_iteration_t *iter = node->obj;
1861 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1865 STACK_PUSHX(stack, int, tag);
1866 STACK_PUSHX(stack, int, iter->minimal);
1868 STACK_PUSHX(stack, voidptr, node);
1869 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1871 STACK_PUSHX(stack, voidptr, iter->arg);
1872 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1874 /* Regset is not empty, so add a tag here. */
1875 if (regset[0] >= 0 || iter->minimal)
1880 status = tre_add_tag_left(mem, node, tag);
1882 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1884 tnfa->tag_directions[tag] = direction;
1885 if (minimal_tag >= 0)
1887 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1888 tnfa->minimal_tags[i] = tag;
1889 tnfa->minimal_tags[i + 1] = minimal_tag;
1890 tnfa->minimal_tags[i + 2] = -1;
1894 tre_purge_regset(regset, tnfa, tag);
1902 direction = TRE_TAG_MINIMIZE;
1907 tre_union_t *uni = node->obj;
1908 tre_ast_node_t *left = uni->left;
1909 tre_ast_node_t *right = uni->right;
1915 left_tag = next_tag;
1916 right_tag = next_tag + 1;
1921 right_tag = next_tag;
1924 /* After processing right child. */
1925 STACK_PUSHX(stack, int, right_tag);
1926 STACK_PUSHX(stack, int, left_tag);
1927 STACK_PUSHX(stack, voidptr, regset);
1928 STACK_PUSHX(stack, int, regset[0] >= 0);
1929 STACK_PUSHX(stack, voidptr, node);
1930 STACK_PUSHX(stack, voidptr, right);
1931 STACK_PUSHX(stack, voidptr, left);
1932 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1934 /* Process right child. */
1935 STACK_PUSHX(stack, voidptr, right);
1936 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1938 /* After processing left child. */
1939 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1941 /* Process left child. */
1942 STACK_PUSHX(stack, voidptr, left);
1943 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1945 /* Regset is not empty, so add a tag here. */
1951 status = tre_add_tag_left(mem, node, tag);
1952 tnfa->tag_directions[tag] = direction;
1953 if (minimal_tag >= 0)
1955 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1956 tnfa->minimal_tags[i] = tag;
1957 tnfa->minimal_tags[i + 1] = minimal_tag;
1958 tnfa->minimal_tags[i + 2] = -1;
1962 tre_purge_regset(regset, tnfa, tag);
1971 if (node->num_submatches > 0)
1973 /* The next two tags are reserved for markers. */
1983 if (node->submatch_id >= 0)
1986 /* Push this submatch on the parents stack. */
1987 for (i = 0; parents[i] >= 0; i++);
1988 parents[i] = node->submatch_id;
1989 parents[i + 1] = -1;
1992 break; /* end case: ADDTAGS_RECURSE */
1994 case ADDTAGS_AFTER_ITERATION:
1998 node = tre_stack_pop_voidptr(stack);
2001 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
2002 + tre_stack_pop_int(stack);
2007 minimal = tre_stack_pop_int(stack);
2008 enter_tag = tre_stack_pop_int(stack);
2010 minimal_tag = enter_tag;
2016 direction = TRE_TAG_MINIMIZE;
2018 direction = TRE_TAG_MAXIMIZE;
2023 case ADDTAGS_AFTER_CAT_LEFT:
2025 int new_tag = tre_stack_pop_int(stack);
2026 next_tag = tre_stack_pop_int(stack);
2034 case ADDTAGS_AFTER_CAT_RIGHT:
2035 node = tre_stack_pop_voidptr(stack);
2037 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
2038 + ((tre_catenation_t *)node->obj)->right->num_tags;
2041 case ADDTAGS_AFTER_UNION_LEFT:
2042 /* Lift the bottom of the `regset' array so that when processing
2043 the right operand the items currently in the array are
2044 invisible. The original bottom was saved at ADDTAGS_UNION and
2045 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
2046 while (*regset >= 0)
2050 case ADDTAGS_AFTER_UNION_RIGHT:
2052 int added_tags, tag_left, tag_right;
2053 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2054 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2055 node = tre_stack_pop_voidptr(stack);
2056 added_tags = tre_stack_pop_int(stack);
2059 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2060 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2061 + ((node->num_submatches > 0) ? 2 : 0);
2063 regset = tre_stack_pop_voidptr(stack);
2064 tag_left = tre_stack_pop_int(stack);
2065 tag_right = tre_stack_pop_int(stack);
2067 /* Add tags after both children, the left child gets a smaller
2068 tag than the right child. This guarantees that we prefer
2069 the left child over the right child. */
2070 /* XXX - This is not always necessary (if the children have
2071 tags which must be seen for every match of that child). */
2072 /* XXX - Check if this is the only place where tre_add_tag_right
2073 is used. If so, use tre_add_tag_left (putting the tag before
2074 the child as opposed after the child) and throw away
2075 tre_add_tag_right. */
2076 if (node->num_submatches > 0)
2080 status = tre_add_tag_right(mem, left, tag_left);
2081 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2082 status = tre_add_tag_right(mem, right, tag_right);
2083 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2087 direction = TRE_TAG_MAXIMIZE;
2095 } /* end switch(symbol) */
2096 } /* end while(tre_stack_num_objects(stack) > bottom) */
2099 tre_purge_regset(regset, tnfa, tag);
2101 if (!first_pass && minimal_tag >= 0)
2104 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2105 tnfa->minimal_tags[i] = tag;
2106 tnfa->minimal_tags[i + 1] = minimal_tag;
2107 tnfa->minimal_tags[i + 2] = -1;
2112 assert(tree->num_tags == num_tags);
2113 tnfa->end_tag = num_tags;
2114 tnfa->num_tags = num_tags;
2115 tnfa->num_minimals = num_minimals;
2118 xfree(saved_states);
2125 AST to TNFA compilation routines.
2131 } tre_copyast_symbol_t;
2133 /* Flags for tre_copy_ast(). */
2134 #define COPY_REMOVE_TAGS 1
2135 #define COPY_MAXIMIZE_FIRST_TAG 2
2137 static reg_errcode_t
2138 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2139 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2140 tre_ast_node_t **copy, int *max_pos)
2142 reg_errcode_t status = REG_OK;
2143 int bottom = tre_stack_num_objects(stack);
2146 tre_ast_node_t **result = copy;
2147 tre_copyast_symbol_t symbol;
2149 STACK_PUSH(stack, voidptr, ast);
2150 STACK_PUSH(stack, int, COPY_RECURSE);
2152 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2154 tre_ast_node_t *node;
2155 if (status != REG_OK)
2158 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2161 case COPY_SET_RESULT_PTR:
2162 result = tre_stack_pop_voidptr(stack);
2165 node = tre_stack_pop_voidptr(stack);
2170 tre_literal_t *lit = node->obj;
2171 int pos = lit->position;
2172 int min = lit->code_min;
2173 int max = lit->code_max;
2174 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2176 /* XXX - e.g. [ab] has only one position but two
2177 nodes, so we are creating holes in the state space
2178 here. Not fatal, just wastes memory. */
2182 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2184 /* Change this tag to empty. */
2188 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2191 /* Maximize the first tag. */
2192 tag_directions[max] = TRE_TAG_MAXIMIZE;
2195 *result = tre_ast_new_literal(mem, min, max, pos);
2196 if (*result == NULL)
2197 status = REG_ESPACE;
2205 tre_union_t *uni = node->obj;
2207 *result = tre_ast_new_union(mem, uni->left, uni->right);
2208 if (*result == NULL)
2210 status = REG_ESPACE;
2213 tmp = (*result)->obj;
2214 result = &tmp->left;
2215 STACK_PUSHX(stack, voidptr, uni->right);
2216 STACK_PUSHX(stack, int, COPY_RECURSE);
2217 STACK_PUSHX(stack, voidptr, &tmp->right);
2218 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2219 STACK_PUSHX(stack, voidptr, uni->left);
2220 STACK_PUSHX(stack, int, COPY_RECURSE);
2225 tre_catenation_t *cat = node->obj;
2226 tre_catenation_t *tmp;
2227 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2228 if (*result == NULL)
2230 status = REG_ESPACE;
2233 tmp = (*result)->obj;
2236 result = &tmp->left;
2238 STACK_PUSHX(stack, voidptr, cat->right);
2239 STACK_PUSHX(stack, int, COPY_RECURSE);
2240 STACK_PUSHX(stack, voidptr, &tmp->right);
2241 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2242 STACK_PUSHX(stack, voidptr, cat->left);
2243 STACK_PUSHX(stack, int, COPY_RECURSE);
2248 tre_iteration_t *iter = node->obj;
2249 STACK_PUSHX(stack, voidptr, iter->arg);
2250 STACK_PUSHX(stack, int, COPY_RECURSE);
2251 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2252 iter->max, iter->minimal);
2253 if (*result == NULL)
2255 status = REG_ESPACE;
2258 iter = (*result)->obj;
2259 result = &iter->arg;
2269 *pos_add += num_copied;
2276 } tre_expand_ast_symbol_t;
2278 /* Expands each iteration node that has a finite nonzero minimum or maximum
2279 iteration count to a catenated sequence of copies of the node. */
2280 static reg_errcode_t
2281 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2282 int *position, tre_tag_direction_t *tag_directions,
2285 reg_errcode_t status = REG_OK;
2286 int bottom = tre_stack_num_objects(stack);
2288 int pos_add_total = 0;
2292 STACK_PUSHR(stack, voidptr, ast);
2293 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2294 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2296 tre_ast_node_t *node;
2297 tre_expand_ast_symbol_t symbol;
2299 if (status != REG_OK)
2302 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2303 node = tre_stack_pop_voidptr(stack);
2306 case EXPAND_RECURSE:
2311 tre_literal_t *lit= node->obj;
2312 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2314 lit->position += pos_add;
2315 if (lit->position > max_pos)
2316 max_pos = lit->position;
2322 tre_union_t *uni = node->obj;
2323 STACK_PUSHX(stack, voidptr, uni->right);
2324 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2325 STACK_PUSHX(stack, voidptr, uni->left);
2326 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2331 tre_catenation_t *cat = node->obj;
2332 STACK_PUSHX(stack, voidptr, cat->right);
2333 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2334 STACK_PUSHX(stack, voidptr, cat->left);
2335 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2340 tre_iteration_t *iter = node->obj;
2341 STACK_PUSHX(stack, int, pos_add);
2342 STACK_PUSHX(stack, voidptr, node);
2343 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2344 STACK_PUSHX(stack, voidptr, iter->arg);
2345 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2346 /* If we are going to expand this node at EXPAND_AFTER_ITER
2347 then don't increase the `pos' fields of the nodes now, it
2348 will get done when expanding. */
2349 if (iter->min > 1 || iter->max > 1)
2359 case EXPAND_AFTER_ITER:
2361 tre_iteration_t *iter = node->obj;
2363 pos_add = tre_stack_pop_int(stack);
2364 pos_add_last = pos_add;
2365 if (iter->min > 1 || iter->max > 1)
2367 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2369 int pos_add_save = pos_add;
2371 /* Create a catenated sequence of copies of the node. */
2372 for (j = 0; j < iter->min; j++)
2374 tre_ast_node_t *copy;
2375 /* Remove tags from all but the last copy. */
2376 int flags = ((j + 1 < iter->min)
2378 : COPY_MAXIMIZE_FIRST_TAG);
2379 pos_add_save = pos_add;
2380 status = tre_copy_ast(mem, stack, iter->arg, flags,
2381 &pos_add, tag_directions, ©,
2383 if (status != REG_OK)
2386 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2393 if (iter->max == -1)
2395 /* No upper limit. */
2396 pos_add_save = pos_add;
2397 status = tre_copy_ast(mem, stack, iter->arg, 0,
2398 &pos_add, NULL, &seq2, &max_pos);
2399 if (status != REG_OK)
2401 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2407 for (j = iter->min; j < iter->max; j++)
2409 tre_ast_node_t *tmp, *copy;
2410 pos_add_save = pos_add;
2411 status = tre_copy_ast(mem, stack, iter->arg, 0,
2412 &pos_add, NULL, ©, &max_pos);
2413 if (status != REG_OK)
2416 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2421 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2424 seq2 = tre_ast_new_union(mem, tmp, seq2);
2430 pos_add = pos_add_save;
2433 else if (seq2 != NULL)
2434 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2437 node->obj = seq1->obj;
2438 node->type = seq1->type;
2442 pos_add_total += pos_add - pos_add_last;
2443 if (iter_depth == 0)
2444 pos_add = pos_add_total;
2454 *position += pos_add_total;
2456 /* `max_pos' should never be larger than `*position' if the above
2457 code works, but just an extra safeguard let's make sure
2458 `*position' is set large enough so enough memory will be
2459 allocated for the transition table. */
2460 if (max_pos > *position)
2461 *position = max_pos;
2466 static tre_pos_and_tags_t *
2467 tre_set_empty(tre_mem_t mem)
2469 tre_pos_and_tags_t *new_set;
2471 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2472 if (new_set == NULL)
2475 new_set[0].position = -1;
2476 new_set[0].code_min = -1;
2477 new_set[0].code_max = -1;
2482 static tre_pos_and_tags_t *
2483 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2484 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2486 tre_pos_and_tags_t *new_set;
2488 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2489 if (new_set == NULL)
2492 new_set[0].position = position;
2493 new_set[0].code_min = code_min;
2494 new_set[0].code_max = code_max;
2495 new_set[0].class = class;
2496 new_set[0].neg_classes = neg_classes;
2497 new_set[0].backref = backref;
2498 new_set[1].position = -1;
2499 new_set[1].code_min = -1;
2500 new_set[1].code_max = -1;
2505 static tre_pos_and_tags_t *
2506 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2507 int *tags, int assertions)
2510 tre_pos_and_tags_t *new_set;
2514 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2515 for (s1 = 0; set1[s1].position >= 0; s1++);
2516 for (s2 = 0; set2[s2].position >= 0; s2++);
2517 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2521 for (s1 = 0; set1[s1].position >= 0; s1++)
2523 new_set[s1].position = set1[s1].position;
2524 new_set[s1].code_min = set1[s1].code_min;
2525 new_set[s1].code_max = set1[s1].code_max;
2526 new_set[s1].assertions = set1[s1].assertions | assertions;
2527 new_set[s1].class = set1[s1].class;
2528 new_set[s1].neg_classes = set1[s1].neg_classes;
2529 new_set[s1].backref = set1[s1].backref;
2530 if (set1[s1].tags == NULL && tags == NULL)
2531 new_set[s1].tags = NULL;
2534 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2535 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2536 * (i + num_tags + 1)));
2537 if (new_tags == NULL)
2539 for (j = 0; j < i; j++)
2540 new_tags[j] = set1[s1].tags[j];
2541 for (i = 0; i < num_tags; i++)
2542 new_tags[j + i] = tags[i];
2543 new_tags[j + i] = -1;
2544 new_set[s1].tags = new_tags;
2548 for (s2 = 0; set2[s2].position >= 0; s2++)
2550 new_set[s1 + s2].position = set2[s2].position;
2551 new_set[s1 + s2].code_min = set2[s2].code_min;
2552 new_set[s1 + s2].code_max = set2[s2].code_max;
2553 /* XXX - why not | assertions here as well? */
2554 new_set[s1 + s2].assertions = set2[s2].assertions;
2555 new_set[s1 + s2].class = set2[s2].class;
2556 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2557 new_set[s1 + s2].backref = set2[s2].backref;
2558 if (set2[s2].tags == NULL)
2559 new_set[s1 + s2].tags = NULL;
2562 for (i = 0; set2[s2].tags[i] >= 0; i++);
2563 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2564 if (new_tags == NULL)
2566 for (j = 0; j < i; j++)
2567 new_tags[j] = set2[s2].tags[j];
2569 new_set[s1 + s2].tags = new_tags;
2572 new_set[s1 + s2].position = -1;
2576 /* Finds the empty path through `node' which is the one that should be
2577 taken according to POSIX.2 rules, and adds the tags on that path to
2578 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2579 set to the number of tags seen on the path. */
2580 static reg_errcode_t
2581 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2582 int *assertions, int *params, int *num_tags_seen,
2587 tre_catenation_t *cat;
2588 tre_iteration_t *iter;
2590 int bottom = tre_stack_num_objects(stack);
2591 reg_errcode_t status = REG_OK;
2595 status = tre_stack_push_voidptr(stack, node);
2597 /* Walk through the tree recursively. */
2598 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2600 node = tre_stack_pop_voidptr(stack);
2605 lit = (tre_literal_t *)node->obj;
2606 switch (lit->code_min)
2609 if (lit->code_max >= 0)
2613 /* Add the tag to `tags'. */
2614 for (i = 0; tags[i] >= 0; i++)
2615 if (tags[i] == lit->code_max)
2619 tags[i] = lit->code_max;
2628 assert(lit->code_max >= 1
2629 || lit->code_max <= ASSERT_LAST);
2630 if (assertions != NULL)
2631 *assertions |= lit->code_max;
2642 /* Subexpressions starting earlier take priority over ones
2643 starting later, so we prefer the left subexpression over the
2644 right subexpression. */
2645 uni = (tre_union_t *)node->obj;
2646 if (uni->left->nullable)
2647 STACK_PUSHX(stack, voidptr, uni->left)
2648 else if (uni->right->nullable)
2649 STACK_PUSHX(stack, voidptr, uni->right)
2655 /* The path must go through both children. */
2656 cat = (tre_catenation_t *)node->obj;
2657 assert(cat->left->nullable);
2658 assert(cat->right->nullable);
2659 STACK_PUSHX(stack, voidptr, cat->left);
2660 STACK_PUSHX(stack, voidptr, cat->right);
2664 /* A match with an empty string is preferred over no match at
2665 all, so we go through the argument if possible. */
2666 iter = (tre_iteration_t *)node->obj;
2667 if (iter->arg->nullable)
2668 STACK_PUSHX(stack, voidptr, iter->arg);
2684 NFL_POST_CATENATION,
2686 } tre_nfl_stack_symbol_t;
2689 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2690 the nodes of the AST `tree'. */
2691 static reg_errcode_t
2692 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2694 int bottom = tre_stack_num_objects(stack);
2696 STACK_PUSHR(stack, voidptr, tree);
2697 STACK_PUSHR(stack, int, NFL_RECURSE);
2699 while (tre_stack_num_objects(stack) > bottom)
2701 tre_nfl_stack_symbol_t symbol;
2702 tre_ast_node_t *node;
2704 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2705 node = tre_stack_pop_voidptr(stack);
2713 tre_literal_t *lit = (tre_literal_t *)node->obj;
2714 if (IS_BACKREF(lit))
2716 /* Back references: nullable = false, firstpos = {i},
2719 node->firstpos = tre_set_one(mem, lit->position, 0,
2720 TRE_CHAR_MAX, 0, NULL, -1);
2721 if (!node->firstpos)
2723 node->lastpos = tre_set_one(mem, lit->position, 0,
2724 TRE_CHAR_MAX, 0, NULL,
2725 (int)lit->code_max);
2729 else if (lit->code_min < 0)
2731 /* Tags, empty strings, params, and zero width assertions:
2732 nullable = true, firstpos = {}, and lastpos = {}. */
2734 node->firstpos = tre_set_empty(mem);
2735 if (!node->firstpos)
2737 node->lastpos = tre_set_empty(mem);
2743 /* Literal at position i: nullable = false, firstpos = {i},
2747 tre_set_one(mem, lit->position, (int)lit->code_min,
2748 (int)lit->code_max, 0, NULL, -1);
2749 if (!node->firstpos)
2751 node->lastpos = tre_set_one(mem, lit->position,
2754 lit->u.class, lit->neg_classes,
2763 /* Compute the attributes for the two subtrees, and after that
2765 STACK_PUSHR(stack, voidptr, node);
2766 STACK_PUSHR(stack, int, NFL_POST_UNION);
2767 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2768 STACK_PUSHR(stack, int, NFL_RECURSE);
2769 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2770 STACK_PUSHR(stack, int, NFL_RECURSE);
2774 /* Compute the attributes for the two subtrees, and after that
2776 STACK_PUSHR(stack, voidptr, node);
2777 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2778 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2779 STACK_PUSHR(stack, int, NFL_RECURSE);
2780 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2781 STACK_PUSHR(stack, int, NFL_RECURSE);
2785 /* Compute the attributes for the subtree, and after that for
2787 STACK_PUSHR(stack, voidptr, node);
2788 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2789 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2790 STACK_PUSHR(stack, int, NFL_RECURSE);
2793 break; /* end case: NFL_RECURSE */
2795 case NFL_POST_UNION:
2797 tre_union_t *uni = (tre_union_t *)node->obj;
2798 node->nullable = uni->left->nullable || uni->right->nullable;
2799 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2800 uni->right->firstpos, NULL, 0);
2801 if (!node->firstpos)
2803 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2804 uni->right->lastpos, NULL, 0);
2810 case NFL_POST_ITERATION:
2812 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2814 if (iter->min == 0 || iter->arg->nullable)
2818 node->firstpos = iter->arg->firstpos;
2819 node->lastpos = iter->arg->lastpos;
2823 case NFL_POST_CATENATION:
2825 int num_tags, *tags, assertions, params_seen;
2827 reg_errcode_t status;
2828 tre_catenation_t *cat = node->obj;
2829 node->nullable = cat->left->nullable && cat->right->nullable;
2831 /* Compute firstpos. */
2832 if (cat->left->nullable)
2834 /* The left side matches the empty string. Make a first pass
2835 with tre_match_empty() to get the number of tags and
2837 status = tre_match_empty(stack, cat->left,
2838 NULL, NULL, NULL, &num_tags,
2840 if (status != REG_OK)
2842 /* Allocate arrays for the tags and parameters. */
2843 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2848 /* Second pass with tre_mach_empty() to get the list of
2849 tags and parameters. */
2850 status = tre_match_empty(stack, cat->left, tags,
2851 &assertions, params, NULL, NULL);
2852 if (status != REG_OK)
2858 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2861 if (!node->firstpos)
2866 node->firstpos = cat->left->firstpos;
2869 /* Compute lastpos. */
2870 if (cat->right->nullable)
2872 /* The right side matches the empty string. Make a first pass
2873 with tre_match_empty() to get the number of tags and
2875 status = tre_match_empty(stack, cat->right,
2876 NULL, NULL, NULL, &num_tags,
2878 if (status != REG_OK)
2880 /* Allocate arrays for the tags and parameters. */
2881 tags = xmalloc(sizeof(int) * (num_tags + 1));
2886 /* Second pass with tre_mach_empty() to get the list of
2887 tags and parameters. */
2888 status = tre_match_empty(stack, cat->right, tags,
2889 &assertions, params, NULL, NULL);
2890 if (status != REG_OK)
2896 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2904 node->lastpos = cat->right->lastpos;
2919 /* Adds a transition from each position in `p1' to each position in `p2'. */
2920 static reg_errcode_t
2921 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2922 tre_tnfa_transition_t *transitions,
2923 int *counts, int *offs)
2925 tre_pos_and_tags_t *orig_p2 = p2;
2926 tre_tnfa_transition_t *trans;
2927 int i, j, k, l, dup, prev_p2_pos;
2929 if (transitions != NULL)
2930 while (p1->position >= 0)
2934 while (p2->position >= 0)
2936 /* Optimization: if this position was already handled, skip it. */
2937 if (p2->position == prev_p2_pos)
2942 prev_p2_pos = p2->position;
2943 /* Set `trans' to point to the next unused transition from
2944 position `p1->position'. */
2945 trans = transitions + offs[p1->position];
2946 while (trans->state != NULL)
2949 /* If we find a previous transition from `p1->position' to
2950 `p2->position', it is overwritten. This can happen only
2951 if there are nested loops in the regexp, like in "((a)*)*".
2952 In POSIX.2 repetition using the outer loop is always
2953 preferred over using the inner loop. Therefore the
2954 transition for the inner loop is useless and can be thrown
2956 /* XXX - The same position is used for all nodes in a bracket
2957 expression, so this optimization cannot be used (it will
2958 break bracket expressions) unless I figure out a way to
2960 if (trans->state_id == p2->position)
2968 if (trans->state == NULL)
2969 (trans + 1)->state = NULL;
2970 /* Use the character ranges, assertions, etc. from `p1' for
2971 the transition from `p1' to `p2'. */
2972 trans->code_min = p1->code_min;
2973 trans->code_max = p1->code_max;
2974 trans->state = transitions + offs[p2->position];
2975 trans->state_id = p2->position;
2976 trans->assertions = p1->assertions | p2->assertions
2977 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2978 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2979 if (p1->backref >= 0)
2981 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2982 assert(p2->backref < 0);
2983 trans->u.backref = p1->backref;
2984 trans->assertions |= ASSERT_BACKREF;
2987 trans->u.class = p1->class;
2988 if (p1->neg_classes != NULL)
2990 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2991 trans->neg_classes =
2992 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2993 if (trans->neg_classes == NULL)
2995 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2996 trans->neg_classes[i] = p1->neg_classes[i];
2997 trans->neg_classes[i] = (tre_ctype_t)0;
3000 trans->neg_classes = NULL;
3002 /* Find out how many tags this transition has. */
3004 if (p1->tags != NULL)
3005 while(p1->tags[i] >= 0)
3008 if (p2->tags != NULL)
3009 while(p2->tags[j] >= 0)
3012 /* If we are overwriting a transition, free the old tag array. */
3013 if (trans->tags != NULL)
3017 /* If there were any tags, allocate an array and fill it. */
3020 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
3024 if (p1->tags != NULL)
3025 while(p1->tags[i] >= 0)
3027 trans->tags[i] = p1->tags[i];
3032 if (p2->tags != NULL)
3033 while (p2->tags[j] >= 0)
3035 /* Don't add duplicates. */
3037 for (k = 0; k < i; k++)
3038 if (trans->tags[k] == p2->tags[j])
3044 trans->tags[l++] = p2->tags[j];
3047 trans->tags[l] = -1;
3055 /* Compute a maximum limit for the number of transitions leaving
3057 while (p1->position >= 0)
3060 while (p2->position >= 0)
3062 counts[p1->position]++;
3070 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3071 labelled with one character range (there are no transitions on empty
3072 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3074 static reg_errcode_t
3075 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3076 int *counts, int *offs)
3079 tre_catenation_t *cat;
3080 tre_iteration_t *iter;
3081 reg_errcode_t errcode = REG_OK;
3083 /* XXX - recurse using a stack!. */
3089 uni = (tre_union_t *)node->obj;
3090 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3091 if (errcode != REG_OK)
3093 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3097 cat = (tre_catenation_t *)node->obj;
3098 /* Add a transition from each position in cat->left->lastpos
3099 to each position in cat->right->firstpos. */
3100 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3101 transitions, counts, offs);
3102 if (errcode != REG_OK)
3104 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3105 if (errcode != REG_OK)
3107 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3111 iter = (tre_iteration_t *)node->obj;
3112 assert(iter->max == -1 || iter->max == 1);
3114 if (iter->max == -1)
3116 assert(iter->min == 0 || iter->min == 1);
3117 /* Add a transition from each last position in the iterated
3118 expression to each first position. */
3119 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3120 transitions, counts, offs);
3121 if (errcode != REG_OK)
3124 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3131 #define ERROR_EXIT(err) \
3135 if (/*CONSTCOND*/1) \
3138 while (/*CONSTCOND*/0)
3142 regcomp(regex_t *preg, const char *regex, int cflags)
3145 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3146 tre_pos_and_tags_t *p;
3147 int *counts = NULL, *offs = NULL;
3149 tre_tnfa_transition_t *transitions, *initial;
3150 tre_tnfa_t *tnfa = NULL;
3151 tre_submatch_data_t *submatch_data;
3152 tre_tag_direction_t *tag_directions = NULL;
3153 reg_errcode_t errcode;
3156 /* Parse context. */
3157 tre_parse_ctx_t parse_ctx;
3159 /* Allocate a stack used throughout the compilation process for various
3161 stack = tre_stack_new(512, 10240, 128);
3164 /* Allocate a fast memory allocator. */
3165 mem = tre_mem_new();
3168 tre_stack_destroy(stack);
3172 /* Parse the regexp. */
3173 memset(&parse_ctx, 0, sizeof(parse_ctx));
3174 parse_ctx.mem = mem;
3175 parse_ctx.stack = stack;
3176 parse_ctx.re = regex;
3177 parse_ctx.cflags = cflags;
3178 parse_ctx.max_backref = -1;
3179 errcode = tre_parse(&parse_ctx);
3180 if (errcode != REG_OK)
3181 ERROR_EXIT(errcode);
3182 preg->re_nsub = parse_ctx.submatch_id - 1;
3183 tree = parse_ctx.result;
3185 /* Back references and approximate matching cannot currently be used
3186 in the same regexp. */
3187 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3188 ERROR_EXIT(REG_BADPAT);
3191 tre_ast_print(tree);
3192 #endif /* TRE_DEBUG */
3194 /* Referring to nonexistent subexpressions is illegal. */
3195 if (parse_ctx.max_backref > (int)preg->re_nsub)
3196 ERROR_EXIT(REG_ESUBREG);
3198 /* Allocate the TNFA struct. */
3199 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3201 ERROR_EXIT(REG_ESPACE);
3202 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3203 tnfa->have_approx = parse_ctx.have_approx;
3204 tnfa->num_submatches = parse_ctx.submatch_id;
3206 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3207 regexp does not have back references, this can be skipped. */
3208 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3211 /* Figure out how many tags we will need. */
3212 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3213 if (errcode != REG_OK)
3214 ERROR_EXIT(errcode);
3216 if (tnfa->num_tags > 0)
3218 tag_directions = xmalloc(sizeof(*tag_directions)
3219 * (tnfa->num_tags + 1));
3220 if (tag_directions == NULL)
3221 ERROR_EXIT(REG_ESPACE);
3222 tnfa->tag_directions = tag_directions;
3223 memset(tag_directions, -1,
3224 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3226 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3227 sizeof(tnfa->minimal_tags));
3228 if (tnfa->minimal_tags == NULL)
3229 ERROR_EXIT(REG_ESPACE);
3231 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3232 sizeof(*submatch_data));
3233 if (submatch_data == NULL)
3234 ERROR_EXIT(REG_ESPACE);
3235 tnfa->submatch_data = submatch_data;
3237 errcode = tre_add_tags(mem, stack, tree, tnfa);
3238 if (errcode != REG_OK)
3239 ERROR_EXIT(errcode);
3243 /* Expand iteration nodes. */
3244 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3245 tag_directions, &tnfa->params_depth);
3246 if (errcode != REG_OK)
3247 ERROR_EXIT(errcode);
3249 /* Add a dummy node for the final state.
3250 XXX - For certain patterns this dummy node can be optimized away,
3251 for example "a*" or "ab*". Figure out a simple way to detect
3252 this possibility. */
3254 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3255 if (tmp_ast_r == NULL)
3256 ERROR_EXIT(REG_ESPACE);
3258 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3260 ERROR_EXIT(REG_ESPACE);
3262 errcode = tre_compute_nfl(mem, stack, tree);
3263 if (errcode != REG_OK)
3264 ERROR_EXIT(errcode);
3266 counts = xmalloc(sizeof(int) * parse_ctx.position);
3268 ERROR_EXIT(REG_ESPACE);
3270 offs = xmalloc(sizeof(int) * parse_ctx.position);
3272 ERROR_EXIT(REG_ESPACE);
3274 for (i = 0; i < parse_ctx.position; i++)
3276 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3279 for (i = 0; i < parse_ctx.position; i++)
3282 add += counts[i] + 1;
3285 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3286 if (transitions == NULL)
3287 ERROR_EXIT(REG_ESPACE);
3288 tnfa->transitions = transitions;
3289 tnfa->num_transitions = add;
3291 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3292 if (errcode != REG_OK)
3293 ERROR_EXIT(errcode);
3295 tnfa->firstpos_chars = NULL;
3299 while (p->position >= 0)
3305 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3306 if (initial == NULL)
3307 ERROR_EXIT(REG_ESPACE);
3308 tnfa->initial = initial;
3311 for (p = tree->firstpos; p->position >= 0; p++)
3313 initial[i].state = transitions + offs[p->position];
3314 initial[i].state_id = p->position;
3315 initial[i].tags = NULL;
3316 /* Copy the arrays p->tags, and p->params, they are allocated
3317 from a tre_mem object. */
3321 for (j = 0; p->tags[j] >= 0; j++);
3322 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3323 if (!initial[i].tags)
3324 ERROR_EXIT(REG_ESPACE);
3325 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3327 initial[i].assertions = p->assertions;
3330 initial[i].state = NULL;
3332 tnfa->num_transitions = add;
3333 tnfa->final = transitions + offs[tree->lastpos[0].position];
3334 tnfa->num_states = parse_ctx.position;
3335 tnfa->cflags = cflags;
3337 tre_mem_destroy(mem);
3338 tre_stack_destroy(stack);
3342 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3346 /* Free everything that was allocated and return the error code. */
3347 tre_mem_destroy(mem);
3349 tre_stack_destroy(stack);
3354 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3363 regfree(regex_t *preg)
3367 tre_tnfa_transition_t *trans;
3369 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3373 for (i = 0; i < tnfa->num_transitions; i++)
3374 if (tnfa->transitions[i].state)
3376 if (tnfa->transitions[i].tags)
3377 xfree(tnfa->transitions[i].tags);
3378 if (tnfa->transitions[i].neg_classes)
3379 xfree(tnfa->transitions[i].neg_classes);
3381 if (tnfa->transitions)
3382 xfree(tnfa->transitions);
3386 for (trans = tnfa->initial; trans->state; trans++)
3391 xfree(tnfa->initial);
3394 if (tnfa->submatch_data)
3396 for (i = 0; i < tnfa->num_submatches; i++)
3397 if (tnfa->submatch_data[i].parents)
3398 xfree(tnfa->submatch_data[i].parents);
3399 xfree(tnfa->submatch_data);
3402 if (tnfa->tag_directions)
3403 xfree(tnfa->tag_directions);
3404 if (tnfa->firstpos_chars)
3405 xfree(tnfa->firstpos_chars);
3406 if (tnfa->minimal_tags)
3407 xfree(tnfa->minimal_tags);