2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 /***********************************************************************
45 ***********************************************************************/
54 tre_ctype_t *neg_classes;
60 /***********************************************************************
61 from tre-ast.c and tre-ast.h
62 ***********************************************************************/
64 /* The different AST node types. */
72 /* Special subtypes of TRE_LITERAL. */
73 #define EMPTY -1 /* Empty leaf (denotes empty string). */
74 #define ASSERTION -2 /* Assertion leaf. */
75 #define TAG -3 /* Tag leaf. */
76 #define BACKREF -4 /* Back reference leaf. */
78 #define IS_SPECIAL(x) ((x)->code_min < 0)
79 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
80 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
81 #define IS_TAG(x) ((x)->code_min == TAG)
82 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
85 /* A generic AST node. All AST nodes consist of this node on the top
86 level with `obj' pointing to the actual content. */
88 tre_ast_type_t type; /* Type of the node. */
89 void *obj; /* Pointer to actual node. */
94 tre_pos_and_tags_t *firstpos;
95 tre_pos_and_tags_t *lastpos;
99 /* A "literal" node. These are created for assertions, back references,
100 tags, matching parameter settings, and all expressions that match one
110 tre_ctype_t *neg_classes;
113 /* A "catenation" node. These are created when two regexps are concatenated.
114 If there are more than one subexpressions in sequence, the `left' part
115 holds all but the last, and `right' part holds the last subexpression
116 (catenation is left associative). */
118 tre_ast_node_t *left;
119 tre_ast_node_t *right;
122 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
125 /* Subexpression to match. */
127 /* Minimum number of consecutive matches. */
129 /* Maximum number of consecutive matches. */
131 /* If 0, match as many characters as possible, if 1 match as few as
132 possible. Note that this does not always mean the same thing as
133 matching as many/few repetitions as possible. */
134 unsigned int minimal:1;
137 /* An "union" node. These are created for the "|" operator. */
139 tre_ast_node_t *left;
140 tre_ast_node_t *right;
143 static tre_ast_node_t *
144 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
146 static tre_ast_node_t *
147 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
149 static tre_ast_node_t *
150 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
153 static tre_ast_node_t *
154 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
156 static tre_ast_node_t *
157 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
158 tre_ast_node_t *right);
161 static tre_ast_node_t *
162 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
164 tre_ast_node_t *node;
166 node = tre_mem_calloc(mem, sizeof(*node));
169 node->obj = tre_mem_calloc(mem, size);
174 node->submatch_id = -1;
179 static tre_ast_node_t *
180 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
182 tre_ast_node_t *node;
185 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
189 lit->code_min = code_min;
190 lit->code_max = code_max;
191 lit->position = position;
196 static tre_ast_node_t *
197 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
200 tre_ast_node_t *node;
201 tre_iteration_t *iter;
203 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
210 iter->minimal = minimal;
211 node->num_submatches = arg->num_submatches;
216 static tre_ast_node_t *
217 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
219 tre_ast_node_t *node;
221 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
224 ((tre_union_t *)node->obj)->left = left;
225 ((tre_union_t *)node->obj)->right = right;
226 node->num_submatches = left->num_submatches + right->num_submatches;
231 static tre_ast_node_t *
232 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
233 tre_ast_node_t *right)
235 tre_ast_node_t *node;
237 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
240 ((tre_catenation_t *)node->obj)->left = left;
241 ((tre_catenation_t *)node->obj)->right = right;
242 node->num_submatches = left->num_submatches + right->num_submatches;
248 /***********************************************************************
249 from tre-stack.c and tre-stack.h
250 ***********************************************************************/
252 typedef struct tre_stack_rec tre_stack_t;
254 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
255 is maximum size, and `increment' specifies how much more space will be
256 allocated with realloc() if all space gets used up. Returns the stack
257 object or NULL if out of memory. */
259 tre_stack_new(int size, int max_size, int increment);
261 /* Frees the stack object. */
263 tre_stack_destroy(tre_stack_t *s);
265 /* Returns the current number of objects in the stack. */
267 tre_stack_num_objects(tre_stack_t *s);
269 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
270 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
271 This tries to realloc() more space before failing if maximum size
272 has not yet been reached. Returns REG_OK if successful. */
273 #define declare_pushf(typetag, type) \
274 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
276 declare_pushf(voidptr, void *);
277 declare_pushf(int, int);
279 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
280 element off of stack `s' and returns it. The stack must not be
282 #define declare_popf(typetag, type) \
283 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
285 declare_popf(voidptr, void *);
286 declare_popf(int, int);
288 /* Just to save some typing. */
289 #define STACK_PUSH(s, typetag, value) \
292 status = tre_stack_push_ ## typetag(s, value); \
294 while (/*CONSTCOND*/0)
296 #define STACK_PUSHX(s, typetag, value) \
298 status = tre_stack_push_ ## typetag(s, value); \
299 if (status != REG_OK) \
303 #define STACK_PUSHR(s, typetag, value) \
305 reg_errcode_t _status; \
306 _status = tre_stack_push_ ## typetag(s, value); \
307 if (_status != REG_OK) \
311 union tre_stack_item {
316 struct tre_stack_rec {
321 union tre_stack_item *stack;
326 tre_stack_new(int size, int max_size, int increment)
330 s = xmalloc(sizeof(*s));
333 s->stack = xmalloc(sizeof(*s->stack) * size);
334 if (s->stack == NULL)
340 s->max_size = max_size;
341 s->increment = increment;
348 tre_stack_destroy(tre_stack_t *s)
355 tre_stack_num_objects(tre_stack_t *s)
361 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
363 if (s->ptr < s->size)
365 s->stack[s->ptr] = value;
370 if (s->size >= s->max_size)
376 union tre_stack_item *new_buffer;
378 new_size = s->size + s->increment;
379 if (new_size > s->max_size)
380 new_size = s->max_size;
381 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
382 if (new_buffer == NULL)
386 assert(new_size > s->size);
388 s->stack = new_buffer;
389 tre_stack_push(s, value);
395 #define define_pushf(typetag, type) \
396 declare_pushf(typetag, type) { \
397 union tre_stack_item item; \
398 item.typetag ## _value = value; \
399 return tre_stack_push(s, item); \
402 define_pushf(int, int)
403 define_pushf(voidptr, void *)
405 #define define_popf(typetag, type) \
406 declare_popf(typetag, type) { \
407 return s->stack[--s->ptr].typetag ## _value; \
410 define_popf(int, int)
411 define_popf(voidptr, void *)
414 /***********************************************************************
415 from tre-parse.c and tre-parse.h
416 ***********************************************************************/
420 /* Memory allocator. The AST is allocated using this. */
422 /* Stack used for keeping track of regexp syntax. */
424 /* The parse result. */
425 tre_ast_node_t *result;
426 /* The regexp to parse and its length. */
428 /* The first character of the entire regexp. */
429 const char *re_start;
430 /* Current submatch ID. */
432 /* Current position (number of literal). */
434 /* The highest back reference or -1 if none seen so far. */
436 /* This flag is set if the regexp uses approximate matching. */
438 /* Compilation flags. */
440 /* If this flag is set the top-level submatch is not captured. */
444 /* Parses a wide character regexp pattern into a syntax tree. This parser
445 handles both syntaxes (BRE and ERE), including the TRE extensions. */
447 tre_parse(tre_parse_ctx_t *ctx);
451 This parser is just a simple recursive descent parser for POSIX.2
452 regexps. The parser supports both the obsolete default syntax and
453 the "extended" syntax, and some nonstandard extensions.
456 /* Characters with special meanings in regexp syntax. */
457 #define CHAR_PIPE '|'
458 #define CHAR_LPAREN '('
459 #define CHAR_RPAREN ')'
460 #define CHAR_LBRACE '{'
461 #define CHAR_RBRACE '}'
462 #define CHAR_LBRACKET '['
463 #define CHAR_RBRACKET ']'
464 #define CHAR_MINUS '-'
465 #define CHAR_STAR '*'
466 #define CHAR_QUESTIONMARK '?'
467 #define CHAR_PLUS '+'
468 #define CHAR_PERIOD '.'
469 #define CHAR_COLON ':'
470 #define CHAR_EQUAL '='
471 #define CHAR_COMMA ','
472 #define CHAR_CARET '^'
473 #define CHAR_DOLLAR '$'
474 #define CHAR_BACKSLASH '\\'
475 #define CHAR_HASH '#'
476 #define CHAR_TILDE '~'
479 /* Some macros for expanding \w, \s, etc. */
480 static const struct tre_macro_struct {
482 const char *expansion;
484 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
485 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
486 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
487 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
492 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
493 must have at least `len' items. Sets buf[0] to zero if the there
494 is no match in `tre_macros'. */
496 tre_expand_macro(const char *regex)
503 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
504 return tre_macros[i].expansion;
508 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
509 tre_ast_node_t ***items)
511 reg_errcode_t status;
512 tre_ast_node_t **array = *items;
513 /* Allocate more space if necessary. */
516 tre_ast_node_t **new_items;
517 /* If the array is already 1024 items large, give up -- there's
518 probably an error in the regexp (e.g. not a '\0' terminated
519 string and missing ']') */
523 new_items = xrealloc(array, sizeof(*items) * *max_i);
524 if (new_items == NULL)
526 *items = array = new_items;
528 array[*i] = tre_ast_new_literal(mem, min, max, -1);
529 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
536 tre_compare_items(const void *a, const void *b)
538 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
539 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
540 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
541 int a_min = l_a->code_min, b_min = l_b->code_min;
545 else if (a_min > b_min)
551 /* Maximum number of character classes that can occur in a negated bracket
553 #define MAX_NEG_CLASSES 64
555 /* Maximum length of character class names. */
556 #define MAX_CLASS_NAME
559 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
560 tre_ctype_t neg_classes[], int *num_neg_classes,
561 tre_ast_node_t ***items, int *num_items,
564 const char *re = ctx->re;
565 reg_errcode_t status = REG_OK;
566 tre_ctype_t class = (tre_ctype_t)0;
568 int max_i = *items_size;
571 /* Build an array of the items in the bracket expression. */
572 while (status == REG_OK)
579 else if (*re == CHAR_RBRACKET && re > ctx->re)
586 tre_cint_t min = 0, max = 0;
588 int clen = mbtowc(&wc, re, -1);
590 if (clen<0) clen=1, wc=WEOF;
592 class = (tre_ctype_t)0;
593 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
597 clen = mbtowc(&wc, re, -1);
598 if (clen<0) clen=1, wc=WEOF;
601 /* XXX - Should use collation order instead of encoding values
602 in character ranges. */
606 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
607 status = REG_ECOLLATE;
608 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
609 status = REG_ECOLLATE;
610 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
613 const char *endptr = re + 2;
615 while (*endptr && *endptr != CHAR_COLON)
619 len = MIN(endptr - re - 2, 63);
620 strncpy(tmp_str, re + 2, len);
622 class = tre_ctype(tmp_str);
634 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
636 /* Two ranges are not allowed to share and endpoint. */
642 if (status != REG_OK)
646 if (*num_neg_classes >= MAX_NEG_CLASSES)
649 neg_classes[(*num_neg_classes)++] = class;
652 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
653 if (status != REG_OK)
655 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
658 /* Add opposite-case counterpoints if REG_ICASE is present.
659 This is broken if there are more than two "same" characters. */
660 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
662 tre_cint_t cmin, ccurr;
666 if (tre_islower(min))
668 cmin = ccurr = tre_toupper(min++);
669 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
671 ccurr = tre_toupper(min++);
672 status = tre_new_item(ctx->mem, cmin, ccurr,
675 else if (tre_isupper(min))
677 cmin = ccurr = tre_tolower(min++);
678 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
680 ccurr = tre_tolower(min++);
681 status = tre_new_item(ctx->mem, cmin, ccurr,
685 if (status != REG_OK)
688 if (status != REG_OK)
700 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
702 tre_ast_node_t *node = NULL;
704 reg_errcode_t status = REG_OK;
705 tre_ast_node_t **items, *u, *n;
706 int i = 0, j, max_i = 32, curr_max, curr_min;
707 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
708 int num_neg_classes = 0;
710 /* Start off with an array of `max_i' elements. */
711 items = xmalloc(sizeof(*items) * max_i);
715 if (*ctx->re == CHAR_CARET)
721 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
724 if (status != REG_OK)
725 goto parse_bracket_done;
727 /* Sort the array if we need to negate it. */
729 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
731 curr_max = curr_min = 0;
732 /* Build a union of the items in the array, negated if necessary. */
733 for (j = 0; j < i && status == REG_OK; j++)
736 tre_literal_t *l = items[j]->obj;
745 curr_max = MAX(max + 1, curr_max);
752 if (curr_max >= curr_min)
754 l->code_min = curr_min;
755 l->code_max = curr_max;
761 curr_min = curr_max = max + 1;
768 l->position = ctx->position;
769 if (num_neg_classes > 0)
771 l->neg_classes = tre_mem_alloc(ctx->mem,
772 (sizeof(l->neg_classes)
773 * (num_neg_classes + 1)));
774 if (l->neg_classes == NULL)
779 for (k = 0; k < num_neg_classes; k++)
780 l->neg_classes[k] = neg_classes[k];
781 l->neg_classes[k] = (tre_ctype_t)0;
784 l->neg_classes = NULL;
789 u = tre_ast_new_union(ctx->mem, node, items[j]);
797 if (status != REG_OK)
798 goto parse_bracket_done;
803 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
808 tre_literal_t *l = n->obj;
809 if (num_neg_classes > 0)
811 l->neg_classes = tre_mem_alloc(ctx->mem,
812 (sizeof(l->neg_classes)
813 * (num_neg_classes + 1)));
814 if (l->neg_classes == NULL)
817 goto parse_bracket_done;
819 for (k = 0; k < num_neg_classes; k++)
820 l->neg_classes[k] = neg_classes[k];
821 l->neg_classes[k] = (tre_ctype_t)0;
824 l->neg_classes = NULL;
829 u = tre_ast_new_union(ctx->mem, node, n);
837 if (status != REG_OK)
838 goto parse_bracket_done;
842 #endif /* TRE_DEBUG */
852 /* Parses a positive decimal integer. Returns -1 if the string does not
853 contain a valid number. */
855 tre_parse_int(const char **regex)
858 const char *r = *regex;
863 num = num * 10 + *r - '0';
872 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
875 const char *r = ctx->re;
878 /* Parse number (minimum repetition count). */
880 if (*r >= '0' && *r <= '9') {
881 min = tre_parse_int(&r);
884 /* Parse comma and second number (maximum repetition count). */
886 if (*r == CHAR_COMMA)
889 max = tre_parse_int(&r);
892 /* Check that the repeat counts are sane. */
893 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
900 /* Empty contents of {}. */
904 /* Parse the ending '}' or '\}'.*/
905 if (ctx->cflags & REG_EXTENDED)
907 if (*r != CHAR_RBRACE)
913 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
918 /* Create the AST node(s). */
919 if (min == 0 && max == 0)
921 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
927 if (min < 0 && max < 0)
928 /* Only approximate parameters set, no repetitions. */
931 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
943 PARSE_MARK_FOR_SUBMATCH,
947 PARSE_POST_CATENATION,
952 } tre_parse_re_stack_symbol_t;
956 tre_parse(tre_parse_ctx_t *ctx)
958 tre_ast_node_t *result = NULL;
959 tre_parse_re_stack_symbol_t symbol;
960 reg_errcode_t status = REG_OK;
961 tre_stack_t *stack = ctx->stack;
962 int bottom = tre_stack_num_objects(stack);
967 if (!ctx->nofirstsub)
969 STACK_PUSH(stack, int, ctx->submatch_id);
970 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
973 STACK_PUSH(stack, int, PARSE_RE);
974 ctx->re_start = ctx->re;
977 /* The following is basically just a recursive descent parser. I use
978 an explicit stack instead of recursive functions mostly because of
979 two reasons: compatibility with systems which have an overflowable
980 call stack, and efficiency (both in lines of code and speed). */
981 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
983 if (status != REG_OK)
985 symbol = tre_stack_pop_int(stack);
989 /* Parse a full regexp. A regexp is one or more branches,
990 separated by the union operator `|'. */
991 if (ctx->cflags & REG_EXTENDED)
992 STACK_PUSHX(stack, int, PARSE_UNION);
993 STACK_PUSHX(stack, int, PARSE_BRANCH);
997 /* Parse a branch. A branch is one or more pieces, concatenated.
998 A piece is an atom possibly followed by a postfix operator. */
999 STACK_PUSHX(stack, int, PARSE_CATENATION);
1000 STACK_PUSHX(stack, int, PARSE_PIECE);
1004 /* Parse a piece. A piece is an atom possibly followed by one
1005 or more postfix operators. */
1006 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1007 STACK_PUSHX(stack, int, PARSE_ATOM);
1010 case PARSE_CATENATION:
1011 /* If the expression has not ended, parse another piece. */
1017 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1019 if ((ctx->cflags & REG_EXTENDED
1020 && c == CHAR_RPAREN && depth > 0)
1021 || (!(ctx->cflags & REG_EXTENDED)
1022 && (c == CHAR_BACKSLASH
1023 && *(ctx->re + 1) == CHAR_RPAREN)))
1025 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1026 status = REG_EPAREN;
1028 if (!(ctx->cflags & REG_EXTENDED))
1034 /* Default case, left associative concatenation. */
1035 STACK_PUSHX(stack, int, PARSE_CATENATION);
1036 STACK_PUSHX(stack, voidptr, result);
1037 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1038 STACK_PUSHX(stack, int, PARSE_PIECE);
1043 case PARSE_POST_CATENATION:
1045 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1046 tre_ast_node_t *tmp_node;
1047 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1060 STACK_PUSHX(stack, int, PARSE_UNION);
1061 STACK_PUSHX(stack, voidptr, result);
1062 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1063 STACK_PUSHX(stack, int, PARSE_BRANCH);
1076 case PARSE_POST_UNION:
1078 tre_ast_node_t *tmp_node;
1079 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1080 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1088 /* Parse postfix operators. */
1094 case CHAR_QUESTIONMARK:
1095 if (!(ctx->cflags & REG_EXTENDED))
1100 tre_ast_node_t *tmp_node;
1105 if (*ctx->re == CHAR_PLUS)
1107 if (*ctx->re == CHAR_QUESTIONMARK)
1111 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1113 if (tmp_node == NULL)
1116 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1120 case CHAR_BACKSLASH:
1121 /* "\{" is special without REG_EXTENDED */
1122 if (!(ctx->cflags & REG_EXTENDED)
1123 && *(ctx->re + 1) == CHAR_LBRACE)
1132 /* "{" is literal without REG_EXTENDED */
1133 if (!(ctx->cflags & REG_EXTENDED))
1139 status = tre_parse_bound(ctx, &result);
1140 if (status != REG_OK)
1142 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1148 /* Parse an atom. An atom is a regular expression enclosed in `()',
1149 an empty set of `()', a bracket expression, `.', `^', `$',
1150 a `\' followed by a character, or a single character. */
1152 /* End of regexp? (empty string). */
1158 case CHAR_LPAREN: /* parenthesized subexpression */
1160 if (ctx->cflags & REG_EXTENDED)
1166 /* First parse a whole RE, then mark the resulting tree
1168 STACK_PUSHX(stack, int, ctx->submatch_id);
1169 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1170 STACK_PUSHX(stack, int, PARSE_RE);
1178 case CHAR_LBRACKET: /* bracket expression */
1180 status = tre_parse_bracket(ctx, &result);
1181 if (status != REG_OK)
1185 case CHAR_BACKSLASH:
1186 /* If this is "\(" or "\)" chew off the backslash and
1188 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1193 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1198 /* If a macro is used, parse the expanded macro recursively. */
1200 const char *buf = tre_expand_macro(ctx->re + 1);
1203 tre_parse_ctx_t subctx;
1204 memcpy(&subctx, ctx, sizeof(subctx));
1206 subctx.nofirstsub = 1;
1207 status = tre_parse(&subctx);
1208 if (status != REG_OK)
1211 ctx->position = subctx.position;
1212 result = subctx.result;
1218 /* Trailing backslash. */
1225 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1230 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1231 ASSERT_AT_WB_NEG, -1);
1235 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1240 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1246 if (ctx->re[0] != CHAR_LBRACE)
1248 /* 8 bit hex char. */
1249 char tmp[3] = {0, 0, 0};
1252 if (tre_isxdigit(ctx->re[0]))
1254 tmp[0] = (char)ctx->re[0];
1257 if (tre_isxdigit(ctx->re[0]))
1259 tmp[1] = (char)ctx->re[0];
1262 val = strtol(tmp, NULL, 16);
1263 result = tre_ast_new_literal(ctx->mem, (int)val,
1264 (int)val, ctx->position);
1275 while (*ctx->re && i < sizeof tmp)
1277 if (ctx->re[0] == CHAR_RBRACE)
1279 if (tre_isxdigit(ctx->re[0]))
1281 tmp[i] = (char)ctx->re[0];
1290 val = strtol(tmp, NULL, 16);
1291 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1299 if (tre_isdigit(*ctx->re))
1301 /* Back reference. */
1302 int val = *ctx->re - '0';
1303 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1308 ctx->max_backref = MAX(val, ctx->max_backref);
1313 /* Escaped character. */
1314 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1325 case CHAR_PERIOD: /* the any-symbol */
1326 if (ctx->cflags & REG_NEWLINE)
1328 tre_ast_node_t *tmp1;
1329 tre_ast_node_t *tmp2;
1330 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1334 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1338 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1345 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1354 case CHAR_CARET: /* beginning of line assertion */
1355 /* '^' has a special meaning everywhere in EREs, and at
1356 beginning of BRE. */
1357 if (ctx->cflags & REG_EXTENDED
1358 || ctx->re == ctx->re_start)
1360 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1370 case CHAR_DOLLAR: /* end of line assertion. */
1371 /* '$' is special everywhere in EREs, and in the end of the
1373 if (ctx->cflags & REG_EXTENDED
1376 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1393 case CHAR_QUESTIONMARK:
1394 if (!(ctx->cflags & REG_EXTENDED))
1398 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1406 clen = mbtowc(&wc, ctx->re, -1);
1407 if (clen<0) clen=1, wc=WEOF;
1409 /* Note that we can't use an tre_isalpha() test here, since there
1410 may be characters which are alphabetic but neither upper or
1412 if (ctx->cflags & REG_ICASE
1413 && (tre_isupper(wc) || tre_islower(wc)))
1415 tre_ast_node_t *tmp1;
1416 tre_ast_node_t *tmp2;
1418 /* XXX - Can there be more than one opposite-case
1419 counterpoints for some character in some locale? Or
1420 more than two characters which all should be regarded
1421 the same character if case is ignored? If yes, there
1422 does not seem to be a portable way to detect it. I guess
1423 that at least for multi-character collating elements there
1424 could be several opposite-case counterpoints, but they
1425 cannot be supported portably anyway. */
1426 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1431 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1436 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1442 result = tre_ast_new_literal(ctx->mem, wc, wc,
1453 case PARSE_MARK_FOR_SUBMATCH:
1455 int submatch_id = tre_stack_pop_int(stack);
1457 if (result->submatch_id >= 0)
1459 tre_ast_node_t *n, *tmp_node;
1460 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1463 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1464 if (tmp_node == NULL)
1466 tmp_node->num_submatches = result->num_submatches;
1469 result->submatch_id = submatch_id;
1470 result->num_submatches++;
1474 case PARSE_RESTORE_CFLAGS:
1475 ctx->cflags = tre_stack_pop_int(stack);
1484 /* Check for missing closing parentheses. */
1488 if (status == REG_OK)
1489 ctx->result = result;
1496 /***********************************************************************
1498 ***********************************************************************/
1503 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1508 Algorithms to setup tags so that submatch addressing can be done.
1512 /* Inserts a catenation node to the root of the tree given in `node'.
1513 As the left child a new tag with number `tag_id' to `node' is added,
1514 and the right child is the old root. */
1515 static reg_errcode_t
1516 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1518 tre_catenation_t *c;
1520 c = tre_mem_alloc(mem, sizeof(*c));
1523 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1524 if (c->left == NULL)
1526 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1527 if (c->right == NULL)
1530 c->right->obj = node->obj;
1531 c->right->type = node->type;
1532 c->right->nullable = -1;
1533 c->right->submatch_id = -1;
1534 c->right->firstpos = NULL;
1535 c->right->lastpos = NULL;
1536 c->right->num_tags = 0;
1538 node->type = CATENATION;
1542 /* Inserts a catenation node to the root of the tree given in `node'.
1543 As the right child a new tag with number `tag_id' to `node' is added,
1544 and the left child is the old root. */
1545 static reg_errcode_t
1546 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1548 tre_catenation_t *c;
1550 c = tre_mem_alloc(mem, sizeof(*c));
1553 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1554 if (c->right == NULL)
1556 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1557 if (c->left == NULL)
1560 c->left->obj = node->obj;
1561 c->left->type = node->type;
1562 c->left->nullable = -1;
1563 c->left->submatch_id = -1;
1564 c->left->firstpos = NULL;
1565 c->left->lastpos = NULL;
1566 c->left->num_tags = 0;
1568 node->type = CATENATION;
1574 ADDTAGS_AFTER_ITERATION,
1575 ADDTAGS_AFTER_UNION_LEFT,
1576 ADDTAGS_AFTER_UNION_RIGHT,
1577 ADDTAGS_AFTER_CAT_LEFT,
1578 ADDTAGS_AFTER_CAT_RIGHT,
1579 ADDTAGS_SET_SUBMATCH_END
1580 } tre_addtags_symbol_t;
1589 /* Go through `regset' and set submatch data for submatches that are
1592 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1596 for (i = 0; regset[i] >= 0; i++)
1598 int id = regset[i] / 2;
1599 int start = !(regset[i] % 2);
1601 tnfa->submatch_data[id].so_tag = tag;
1603 tnfa->submatch_data[id].eo_tag = tag;
1609 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1610 subexpressions marked for submatch addressing can be traced. */
1611 static reg_errcode_t
1612 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1615 reg_errcode_t status = REG_OK;
1616 tre_addtags_symbol_t symbol;
1617 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1618 int bottom = tre_stack_num_objects(stack);
1619 /* True for first pass (counting number of needed tags) */
1620 int first_pass = (mem == NULL || tnfa == NULL);
1621 int *regset, *orig_regset;
1622 int num_tags = 0; /* Total number of tags. */
1623 int num_minimals = 0; /* Number of special minimal tags. */
1624 int tag = 0; /* The tag that is to be added next. */
1625 int next_tag = 1; /* Next tag to use after this one. */
1626 int *parents; /* Stack of submatches the current submatch is
1628 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1629 tre_tag_states_t *saved_states;
1631 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1635 tnfa->minimal_tags[0] = -1;
1638 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1642 orig_regset = regset;
1644 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1645 if (parents == NULL)
1652 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1653 if (saved_states == NULL)
1662 for (i = 0; i <= tnfa->num_submatches; i++)
1663 saved_states[i].tag = -1;
1666 STACK_PUSH(stack, voidptr, node);
1667 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1669 while (tre_stack_num_objects(stack) > bottom)
1671 if (status != REG_OK)
1674 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1678 case ADDTAGS_SET_SUBMATCH_END:
1680 int id = tre_stack_pop_int(stack);
1683 /* Add end of this submatch to regset. */
1684 for (i = 0; regset[i] >= 0; i++);
1685 regset[i] = id * 2 + 1;
1688 /* Pop this submatch from the parents stack. */
1689 for (i = 0; parents[i] >= 0; i++);
1690 parents[i - 1] = -1;
1694 case ADDTAGS_RECURSE:
1695 node = tre_stack_pop_voidptr(stack);
1697 if (node->submatch_id >= 0)
1699 int id = node->submatch_id;
1703 /* Add start of this submatch to regset. */
1704 for (i = 0; regset[i] >= 0; i++);
1710 for (i = 0; parents[i] >= 0; i++);
1711 tnfa->submatch_data[id].parents = NULL;
1714 int *p = xmalloc(sizeof(*p) * (i + 1));
1717 status = REG_ESPACE;
1720 assert(tnfa->submatch_data[id].parents == NULL);
1721 tnfa->submatch_data[id].parents = p;
1722 for (i = 0; parents[i] >= 0; i++)
1728 /* Add end of this submatch to regset after processing this
1730 STACK_PUSHX(stack, int, node->submatch_id);
1731 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1738 tre_literal_t *lit = node->obj;
1740 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1745 /* Regset is not empty, so add a tag before the
1746 literal or backref. */
1749 status = tre_add_tag_left(mem, node, tag);
1750 tnfa->tag_directions[tag] = direction;
1751 if (minimal_tag >= 0)
1753 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1754 tnfa->minimal_tags[i] = tag;
1755 tnfa->minimal_tags[i + 1] = minimal_tag;
1756 tnfa->minimal_tags[i + 2] = -1;
1760 tre_purge_regset(regset, tnfa, tag);
1775 assert(!IS_TAG(lit));
1781 tre_catenation_t *cat = node->obj;
1782 tre_ast_node_t *left = cat->left;
1783 tre_ast_node_t *right = cat->right;
1784 int reserved_tag = -1;
1787 /* After processing right child. */
1788 STACK_PUSHX(stack, voidptr, node);
1789 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1791 /* Process right child. */
1792 STACK_PUSHX(stack, voidptr, right);
1793 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1795 /* After processing left child. */
1796 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1797 if (left->num_tags > 0 && right->num_tags > 0)
1799 /* Reserve the next tag to the right child. */
1800 reserved_tag = next_tag;
1803 STACK_PUSHX(stack, int, reserved_tag);
1804 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1806 /* Process left child. */
1807 STACK_PUSHX(stack, voidptr, left);
1808 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1814 tre_iteration_t *iter = node->obj;
1818 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1822 STACK_PUSHX(stack, int, tag);
1823 STACK_PUSHX(stack, int, iter->minimal);
1825 STACK_PUSHX(stack, voidptr, node);
1826 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1828 STACK_PUSHX(stack, voidptr, iter->arg);
1829 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1831 /* Regset is not empty, so add a tag here. */
1832 if (regset[0] >= 0 || iter->minimal)
1837 status = tre_add_tag_left(mem, node, tag);
1839 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1841 tnfa->tag_directions[tag] = direction;
1842 if (minimal_tag >= 0)
1844 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1845 tnfa->minimal_tags[i] = tag;
1846 tnfa->minimal_tags[i + 1] = minimal_tag;
1847 tnfa->minimal_tags[i + 2] = -1;
1851 tre_purge_regset(regset, tnfa, tag);
1859 direction = TRE_TAG_MINIMIZE;
1864 tre_union_t *uni = node->obj;
1865 tre_ast_node_t *left = uni->left;
1866 tre_ast_node_t *right = uni->right;
1872 left_tag = next_tag;
1873 right_tag = next_tag + 1;
1878 right_tag = next_tag;
1881 /* After processing right child. */
1882 STACK_PUSHX(stack, int, right_tag);
1883 STACK_PUSHX(stack, int, left_tag);
1884 STACK_PUSHX(stack, voidptr, regset);
1885 STACK_PUSHX(stack, int, regset[0] >= 0);
1886 STACK_PUSHX(stack, voidptr, node);
1887 STACK_PUSHX(stack, voidptr, right);
1888 STACK_PUSHX(stack, voidptr, left);
1889 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1891 /* Process right child. */
1892 STACK_PUSHX(stack, voidptr, right);
1893 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1895 /* After processing left child. */
1896 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1898 /* Process left child. */
1899 STACK_PUSHX(stack, voidptr, left);
1900 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1902 /* Regset is not empty, so add a tag here. */
1908 status = tre_add_tag_left(mem, node, tag);
1909 tnfa->tag_directions[tag] = direction;
1910 if (minimal_tag >= 0)
1912 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1913 tnfa->minimal_tags[i] = tag;
1914 tnfa->minimal_tags[i + 1] = minimal_tag;
1915 tnfa->minimal_tags[i + 2] = -1;
1919 tre_purge_regset(regset, tnfa, tag);
1928 if (node->num_submatches > 0)
1930 /* The next two tags are reserved for markers. */
1940 if (node->submatch_id >= 0)
1943 /* Push this submatch on the parents stack. */
1944 for (i = 0; parents[i] >= 0; i++);
1945 parents[i] = node->submatch_id;
1946 parents[i + 1] = -1;
1949 break; /* end case: ADDTAGS_RECURSE */
1951 case ADDTAGS_AFTER_ITERATION:
1955 node = tre_stack_pop_voidptr(stack);
1958 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1959 + tre_stack_pop_int(stack);
1964 minimal = tre_stack_pop_int(stack);
1965 enter_tag = tre_stack_pop_int(stack);
1967 minimal_tag = enter_tag;
1973 direction = TRE_TAG_MINIMIZE;
1975 direction = TRE_TAG_MAXIMIZE;
1980 case ADDTAGS_AFTER_CAT_LEFT:
1982 int new_tag = tre_stack_pop_int(stack);
1983 next_tag = tre_stack_pop_int(stack);
1991 case ADDTAGS_AFTER_CAT_RIGHT:
1992 node = tre_stack_pop_voidptr(stack);
1994 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1995 + ((tre_catenation_t *)node->obj)->right->num_tags;
1998 case ADDTAGS_AFTER_UNION_LEFT:
1999 /* Lift the bottom of the `regset' array so that when processing
2000 the right operand the items currently in the array are
2001 invisible. The original bottom was saved at ADDTAGS_UNION and
2002 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
2003 while (*regset >= 0)
2007 case ADDTAGS_AFTER_UNION_RIGHT:
2009 int added_tags, tag_left, tag_right;
2010 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2011 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2012 node = tre_stack_pop_voidptr(stack);
2013 added_tags = tre_stack_pop_int(stack);
2016 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2017 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2018 + ((node->num_submatches > 0) ? 2 : 0);
2020 regset = tre_stack_pop_voidptr(stack);
2021 tag_left = tre_stack_pop_int(stack);
2022 tag_right = tre_stack_pop_int(stack);
2024 /* Add tags after both children, the left child gets a smaller
2025 tag than the right child. This guarantees that we prefer
2026 the left child over the right child. */
2027 /* XXX - This is not always necessary (if the children have
2028 tags which must be seen for every match of that child). */
2029 /* XXX - Check if this is the only place where tre_add_tag_right
2030 is used. If so, use tre_add_tag_left (putting the tag before
2031 the child as opposed after the child) and throw away
2032 tre_add_tag_right. */
2033 if (node->num_submatches > 0)
2037 status = tre_add_tag_right(mem, left, tag_left);
2038 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2039 status = tre_add_tag_right(mem, right, tag_right);
2040 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2044 direction = TRE_TAG_MAXIMIZE;
2052 } /* end switch(symbol) */
2053 } /* end while(tre_stack_num_objects(stack) > bottom) */
2056 tre_purge_regset(regset, tnfa, tag);
2058 if (!first_pass && minimal_tag >= 0)
2061 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2062 tnfa->minimal_tags[i] = tag;
2063 tnfa->minimal_tags[i + 1] = minimal_tag;
2064 tnfa->minimal_tags[i + 2] = -1;
2069 assert(tree->num_tags == num_tags);
2070 tnfa->end_tag = num_tags;
2071 tnfa->num_tags = num_tags;
2072 tnfa->num_minimals = num_minimals;
2075 xfree(saved_states);
2082 AST to TNFA compilation routines.
2088 } tre_copyast_symbol_t;
2090 /* Flags for tre_copy_ast(). */
2091 #define COPY_REMOVE_TAGS 1
2092 #define COPY_MAXIMIZE_FIRST_TAG 2
2094 static reg_errcode_t
2095 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2096 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2097 tre_ast_node_t **copy, int *max_pos)
2099 reg_errcode_t status = REG_OK;
2100 int bottom = tre_stack_num_objects(stack);
2103 tre_ast_node_t **result = copy;
2104 tre_copyast_symbol_t symbol;
2106 STACK_PUSH(stack, voidptr, ast);
2107 STACK_PUSH(stack, int, COPY_RECURSE);
2109 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2111 tre_ast_node_t *node;
2112 if (status != REG_OK)
2115 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2118 case COPY_SET_RESULT_PTR:
2119 result = tre_stack_pop_voidptr(stack);
2122 node = tre_stack_pop_voidptr(stack);
2127 tre_literal_t *lit = node->obj;
2128 int pos = lit->position;
2129 int min = lit->code_min;
2130 int max = lit->code_max;
2131 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2133 /* XXX - e.g. [ab] has only one position but two
2134 nodes, so we are creating holes in the state space
2135 here. Not fatal, just wastes memory. */
2139 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2141 /* Change this tag to empty. */
2145 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2148 /* Maximize the first tag. */
2149 tag_directions[max] = TRE_TAG_MAXIMIZE;
2152 *result = tre_ast_new_literal(mem, min, max, pos);
2153 if (*result == NULL)
2154 status = REG_ESPACE;
2162 tre_union_t *uni = node->obj;
2164 *result = tre_ast_new_union(mem, uni->left, uni->right);
2165 if (*result == NULL)
2167 status = REG_ESPACE;
2170 tmp = (*result)->obj;
2171 result = &tmp->left;
2172 STACK_PUSHX(stack, voidptr, uni->right);
2173 STACK_PUSHX(stack, int, COPY_RECURSE);
2174 STACK_PUSHX(stack, voidptr, &tmp->right);
2175 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2176 STACK_PUSHX(stack, voidptr, uni->left);
2177 STACK_PUSHX(stack, int, COPY_RECURSE);
2182 tre_catenation_t *cat = node->obj;
2183 tre_catenation_t *tmp;
2184 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2185 if (*result == NULL)
2187 status = REG_ESPACE;
2190 tmp = (*result)->obj;
2193 result = &tmp->left;
2195 STACK_PUSHX(stack, voidptr, cat->right);
2196 STACK_PUSHX(stack, int, COPY_RECURSE);
2197 STACK_PUSHX(stack, voidptr, &tmp->right);
2198 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2199 STACK_PUSHX(stack, voidptr, cat->left);
2200 STACK_PUSHX(stack, int, COPY_RECURSE);
2205 tre_iteration_t *iter = node->obj;
2206 STACK_PUSHX(stack, voidptr, iter->arg);
2207 STACK_PUSHX(stack, int, COPY_RECURSE);
2208 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2209 iter->max, iter->minimal);
2210 if (*result == NULL)
2212 status = REG_ESPACE;
2215 iter = (*result)->obj;
2216 result = &iter->arg;
2226 *pos_add += num_copied;
2233 } tre_expand_ast_symbol_t;
2235 /* Expands each iteration node that has a finite nonzero minimum or maximum
2236 iteration count to a catenated sequence of copies of the node. */
2237 static reg_errcode_t
2238 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2239 int *position, tre_tag_direction_t *tag_directions,
2242 reg_errcode_t status = REG_OK;
2243 int bottom = tre_stack_num_objects(stack);
2245 int pos_add_total = 0;
2249 STACK_PUSHR(stack, voidptr, ast);
2250 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2251 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2253 tre_ast_node_t *node;
2254 tre_expand_ast_symbol_t symbol;
2256 if (status != REG_OK)
2259 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2260 node = tre_stack_pop_voidptr(stack);
2263 case EXPAND_RECURSE:
2268 tre_literal_t *lit= node->obj;
2269 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2271 lit->position += pos_add;
2272 if (lit->position > max_pos)
2273 max_pos = lit->position;
2279 tre_union_t *uni = node->obj;
2280 STACK_PUSHX(stack, voidptr, uni->right);
2281 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2282 STACK_PUSHX(stack, voidptr, uni->left);
2283 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2288 tre_catenation_t *cat = node->obj;
2289 STACK_PUSHX(stack, voidptr, cat->right);
2290 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2291 STACK_PUSHX(stack, voidptr, cat->left);
2292 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2297 tre_iteration_t *iter = node->obj;
2298 STACK_PUSHX(stack, int, pos_add);
2299 STACK_PUSHX(stack, voidptr, node);
2300 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2301 STACK_PUSHX(stack, voidptr, iter->arg);
2302 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2303 /* If we are going to expand this node at EXPAND_AFTER_ITER
2304 then don't increase the `pos' fields of the nodes now, it
2305 will get done when expanding. */
2306 if (iter->min > 1 || iter->max > 1)
2316 case EXPAND_AFTER_ITER:
2318 tre_iteration_t *iter = node->obj;
2320 pos_add = tre_stack_pop_int(stack);
2321 pos_add_last = pos_add;
2322 if (iter->min > 1 || iter->max > 1)
2324 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2326 int pos_add_save = pos_add;
2328 /* Create a catenated sequence of copies of the node. */
2329 for (j = 0; j < iter->min; j++)
2331 tre_ast_node_t *copy;
2332 /* Remove tags from all but the last copy. */
2333 int flags = ((j + 1 < iter->min)
2335 : COPY_MAXIMIZE_FIRST_TAG);
2336 pos_add_save = pos_add;
2337 status = tre_copy_ast(mem, stack, iter->arg, flags,
2338 &pos_add, tag_directions, ©,
2340 if (status != REG_OK)
2343 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2350 if (iter->max == -1)
2352 /* No upper limit. */
2353 pos_add_save = pos_add;
2354 status = tre_copy_ast(mem, stack, iter->arg, 0,
2355 &pos_add, NULL, &seq2, &max_pos);
2356 if (status != REG_OK)
2358 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2364 for (j = iter->min; j < iter->max; j++)
2366 tre_ast_node_t *tmp, *copy;
2367 pos_add_save = pos_add;
2368 status = tre_copy_ast(mem, stack, iter->arg, 0,
2369 &pos_add, NULL, ©, &max_pos);
2370 if (status != REG_OK)
2373 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2378 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2381 seq2 = tre_ast_new_union(mem, tmp, seq2);
2387 pos_add = pos_add_save;
2390 else if (seq2 != NULL)
2391 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2394 node->obj = seq1->obj;
2395 node->type = seq1->type;
2399 pos_add_total += pos_add - pos_add_last;
2400 if (iter_depth == 0)
2401 pos_add = pos_add_total;
2411 *position += pos_add_total;
2413 /* `max_pos' should never be larger than `*position' if the above
2414 code works, but just an extra safeguard let's make sure
2415 `*position' is set large enough so enough memory will be
2416 allocated for the transition table. */
2417 if (max_pos > *position)
2418 *position = max_pos;
2423 static tre_pos_and_tags_t *
2424 tre_set_empty(tre_mem_t mem)
2426 tre_pos_and_tags_t *new_set;
2428 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2429 if (new_set == NULL)
2432 new_set[0].position = -1;
2433 new_set[0].code_min = -1;
2434 new_set[0].code_max = -1;
2439 static tre_pos_and_tags_t *
2440 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2441 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2443 tre_pos_and_tags_t *new_set;
2445 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2446 if (new_set == NULL)
2449 new_set[0].position = position;
2450 new_set[0].code_min = code_min;
2451 new_set[0].code_max = code_max;
2452 new_set[0].class = class;
2453 new_set[0].neg_classes = neg_classes;
2454 new_set[0].backref = backref;
2455 new_set[1].position = -1;
2456 new_set[1].code_min = -1;
2457 new_set[1].code_max = -1;
2462 static tre_pos_and_tags_t *
2463 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2464 int *tags, int assertions)
2467 tre_pos_and_tags_t *new_set;
2471 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2472 for (s1 = 0; set1[s1].position >= 0; s1++);
2473 for (s2 = 0; set2[s2].position >= 0; s2++);
2474 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2478 for (s1 = 0; set1[s1].position >= 0; s1++)
2480 new_set[s1].position = set1[s1].position;
2481 new_set[s1].code_min = set1[s1].code_min;
2482 new_set[s1].code_max = set1[s1].code_max;
2483 new_set[s1].assertions = set1[s1].assertions | assertions;
2484 new_set[s1].class = set1[s1].class;
2485 new_set[s1].neg_classes = set1[s1].neg_classes;
2486 new_set[s1].backref = set1[s1].backref;
2487 if (set1[s1].tags == NULL && tags == NULL)
2488 new_set[s1].tags = NULL;
2491 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2492 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2493 * (i + num_tags + 1)));
2494 if (new_tags == NULL)
2496 for (j = 0; j < i; j++)
2497 new_tags[j] = set1[s1].tags[j];
2498 for (i = 0; i < num_tags; i++)
2499 new_tags[j + i] = tags[i];
2500 new_tags[j + i] = -1;
2501 new_set[s1].tags = new_tags;
2505 for (s2 = 0; set2[s2].position >= 0; s2++)
2507 new_set[s1 + s2].position = set2[s2].position;
2508 new_set[s1 + s2].code_min = set2[s2].code_min;
2509 new_set[s1 + s2].code_max = set2[s2].code_max;
2510 /* XXX - why not | assertions here as well? */
2511 new_set[s1 + s2].assertions = set2[s2].assertions;
2512 new_set[s1 + s2].class = set2[s2].class;
2513 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2514 new_set[s1 + s2].backref = set2[s2].backref;
2515 if (set2[s2].tags == NULL)
2516 new_set[s1 + s2].tags = NULL;
2519 for (i = 0; set2[s2].tags[i] >= 0; i++);
2520 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2521 if (new_tags == NULL)
2523 for (j = 0; j < i; j++)
2524 new_tags[j] = set2[s2].tags[j];
2526 new_set[s1 + s2].tags = new_tags;
2529 new_set[s1 + s2].position = -1;
2533 /* Finds the empty path through `node' which is the one that should be
2534 taken according to POSIX.2 rules, and adds the tags on that path to
2535 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2536 set to the number of tags seen on the path. */
2537 static reg_errcode_t
2538 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2539 int *assertions, int *params, int *num_tags_seen,
2544 tre_catenation_t *cat;
2545 tre_iteration_t *iter;
2547 int bottom = tre_stack_num_objects(stack);
2548 reg_errcode_t status = REG_OK;
2552 status = tre_stack_push_voidptr(stack, node);
2554 /* Walk through the tree recursively. */
2555 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2557 node = tre_stack_pop_voidptr(stack);
2562 lit = (tre_literal_t *)node->obj;
2563 switch (lit->code_min)
2566 if (lit->code_max >= 0)
2570 /* Add the tag to `tags'. */
2571 for (i = 0; tags[i] >= 0; i++)
2572 if (tags[i] == lit->code_max)
2576 tags[i] = lit->code_max;
2585 assert(lit->code_max >= 1
2586 || lit->code_max <= ASSERT_LAST);
2587 if (assertions != NULL)
2588 *assertions |= lit->code_max;
2599 /* Subexpressions starting earlier take priority over ones
2600 starting later, so we prefer the left subexpression over the
2601 right subexpression. */
2602 uni = (tre_union_t *)node->obj;
2603 if (uni->left->nullable)
2604 STACK_PUSHX(stack, voidptr, uni->left)
2605 else if (uni->right->nullable)
2606 STACK_PUSHX(stack, voidptr, uni->right)
2612 /* The path must go through both children. */
2613 cat = (tre_catenation_t *)node->obj;
2614 assert(cat->left->nullable);
2615 assert(cat->right->nullable);
2616 STACK_PUSHX(stack, voidptr, cat->left);
2617 STACK_PUSHX(stack, voidptr, cat->right);
2621 /* A match with an empty string is preferred over no match at
2622 all, so we go through the argument if possible. */
2623 iter = (tre_iteration_t *)node->obj;
2624 if (iter->arg->nullable)
2625 STACK_PUSHX(stack, voidptr, iter->arg);
2641 NFL_POST_CATENATION,
2643 } tre_nfl_stack_symbol_t;
2646 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2647 the nodes of the AST `tree'. */
2648 static reg_errcode_t
2649 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2651 int bottom = tre_stack_num_objects(stack);
2653 STACK_PUSHR(stack, voidptr, tree);
2654 STACK_PUSHR(stack, int, NFL_RECURSE);
2656 while (tre_stack_num_objects(stack) > bottom)
2658 tre_nfl_stack_symbol_t symbol;
2659 tre_ast_node_t *node;
2661 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2662 node = tre_stack_pop_voidptr(stack);
2670 tre_literal_t *lit = (tre_literal_t *)node->obj;
2671 if (IS_BACKREF(lit))
2673 /* Back references: nullable = false, firstpos = {i},
2676 node->firstpos = tre_set_one(mem, lit->position, 0,
2677 TRE_CHAR_MAX, 0, NULL, -1);
2678 if (!node->firstpos)
2680 node->lastpos = tre_set_one(mem, lit->position, 0,
2681 TRE_CHAR_MAX, 0, NULL,
2682 (int)lit->code_max);
2686 else if (lit->code_min < 0)
2688 /* Tags, empty strings, params, and zero width assertions:
2689 nullable = true, firstpos = {}, and lastpos = {}. */
2691 node->firstpos = tre_set_empty(mem);
2692 if (!node->firstpos)
2694 node->lastpos = tre_set_empty(mem);
2700 /* Literal at position i: nullable = false, firstpos = {i},
2704 tre_set_one(mem, lit->position, (int)lit->code_min,
2705 (int)lit->code_max, 0, NULL, -1);
2706 if (!node->firstpos)
2708 node->lastpos = tre_set_one(mem, lit->position,
2711 lit->u.class, lit->neg_classes,
2720 /* Compute the attributes for the two subtrees, and after that
2722 STACK_PUSHR(stack, voidptr, node);
2723 STACK_PUSHR(stack, int, NFL_POST_UNION);
2724 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2725 STACK_PUSHR(stack, int, NFL_RECURSE);
2726 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2727 STACK_PUSHR(stack, int, NFL_RECURSE);
2731 /* Compute the attributes for the two subtrees, and after that
2733 STACK_PUSHR(stack, voidptr, node);
2734 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2735 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2736 STACK_PUSHR(stack, int, NFL_RECURSE);
2737 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2738 STACK_PUSHR(stack, int, NFL_RECURSE);
2742 /* Compute the attributes for the subtree, and after that for
2744 STACK_PUSHR(stack, voidptr, node);
2745 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2746 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2747 STACK_PUSHR(stack, int, NFL_RECURSE);
2750 break; /* end case: NFL_RECURSE */
2752 case NFL_POST_UNION:
2754 tre_union_t *uni = (tre_union_t *)node->obj;
2755 node->nullable = uni->left->nullable || uni->right->nullable;
2756 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2757 uni->right->firstpos, NULL, 0);
2758 if (!node->firstpos)
2760 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2761 uni->right->lastpos, NULL, 0);
2767 case NFL_POST_ITERATION:
2769 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2771 if (iter->min == 0 || iter->arg->nullable)
2775 node->firstpos = iter->arg->firstpos;
2776 node->lastpos = iter->arg->lastpos;
2780 case NFL_POST_CATENATION:
2782 int num_tags, *tags, assertions, params_seen;
2784 reg_errcode_t status;
2785 tre_catenation_t *cat = node->obj;
2786 node->nullable = cat->left->nullable && cat->right->nullable;
2788 /* Compute firstpos. */
2789 if (cat->left->nullable)
2791 /* The left side matches the empty string. Make a first pass
2792 with tre_match_empty() to get the number of tags and
2794 status = tre_match_empty(stack, cat->left,
2795 NULL, NULL, NULL, &num_tags,
2797 if (status != REG_OK)
2799 /* Allocate arrays for the tags and parameters. */
2800 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2805 /* Second pass with tre_mach_empty() to get the list of
2806 tags and parameters. */
2807 status = tre_match_empty(stack, cat->left, tags,
2808 &assertions, params, NULL, NULL);
2809 if (status != REG_OK)
2815 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2818 if (!node->firstpos)
2823 node->firstpos = cat->left->firstpos;
2826 /* Compute lastpos. */
2827 if (cat->right->nullable)
2829 /* The right side matches the empty string. Make a first pass
2830 with tre_match_empty() to get the number of tags and
2832 status = tre_match_empty(stack, cat->right,
2833 NULL, NULL, NULL, &num_tags,
2835 if (status != REG_OK)
2837 /* Allocate arrays for the tags and parameters. */
2838 tags = xmalloc(sizeof(int) * (num_tags + 1));
2843 /* Second pass with tre_mach_empty() to get the list of
2844 tags and parameters. */
2845 status = tre_match_empty(stack, cat->right, tags,
2846 &assertions, params, NULL, NULL);
2847 if (status != REG_OK)
2853 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2861 node->lastpos = cat->right->lastpos;
2876 /* Adds a transition from each position in `p1' to each position in `p2'. */
2877 static reg_errcode_t
2878 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2879 tre_tnfa_transition_t *transitions,
2880 int *counts, int *offs)
2882 tre_pos_and_tags_t *orig_p2 = p2;
2883 tre_tnfa_transition_t *trans;
2884 int i, j, k, l, dup, prev_p2_pos;
2886 if (transitions != NULL)
2887 while (p1->position >= 0)
2891 while (p2->position >= 0)
2893 /* Optimization: if this position was already handled, skip it. */
2894 if (p2->position == prev_p2_pos)
2899 prev_p2_pos = p2->position;
2900 /* Set `trans' to point to the next unused transition from
2901 position `p1->position'. */
2902 trans = transitions + offs[p1->position];
2903 while (trans->state != NULL)
2906 /* If we find a previous transition from `p1->position' to
2907 `p2->position', it is overwritten. This can happen only
2908 if there are nested loops in the regexp, like in "((a)*)*".
2909 In POSIX.2 repetition using the outer loop is always
2910 preferred over using the inner loop. Therefore the
2911 transition for the inner loop is useless and can be thrown
2913 /* XXX - The same position is used for all nodes in a bracket
2914 expression, so this optimization cannot be used (it will
2915 break bracket expressions) unless I figure out a way to
2917 if (trans->state_id == p2->position)
2925 if (trans->state == NULL)
2926 (trans + 1)->state = NULL;
2927 /* Use the character ranges, assertions, etc. from `p1' for
2928 the transition from `p1' to `p2'. */
2929 trans->code_min = p1->code_min;
2930 trans->code_max = p1->code_max;
2931 trans->state = transitions + offs[p2->position];
2932 trans->state_id = p2->position;
2933 trans->assertions = p1->assertions | p2->assertions
2934 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2935 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2936 if (p1->backref >= 0)
2938 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2939 assert(p2->backref < 0);
2940 trans->u.backref = p1->backref;
2941 trans->assertions |= ASSERT_BACKREF;
2944 trans->u.class = p1->class;
2945 if (p1->neg_classes != NULL)
2947 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2948 trans->neg_classes =
2949 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2950 if (trans->neg_classes == NULL)
2952 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2953 trans->neg_classes[i] = p1->neg_classes[i];
2954 trans->neg_classes[i] = (tre_ctype_t)0;
2957 trans->neg_classes = NULL;
2959 /* Find out how many tags this transition has. */
2961 if (p1->tags != NULL)
2962 while(p1->tags[i] >= 0)
2965 if (p2->tags != NULL)
2966 while(p2->tags[j] >= 0)
2969 /* If we are overwriting a transition, free the old tag array. */
2970 if (trans->tags != NULL)
2974 /* If there were any tags, allocate an array and fill it. */
2977 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2981 if (p1->tags != NULL)
2982 while(p1->tags[i] >= 0)
2984 trans->tags[i] = p1->tags[i];
2989 if (p2->tags != NULL)
2990 while (p2->tags[j] >= 0)
2992 /* Don't add duplicates. */
2994 for (k = 0; k < i; k++)
2995 if (trans->tags[k] == p2->tags[j])
3001 trans->tags[l++] = p2->tags[j];
3004 trans->tags[l] = -1;
3012 /* Compute a maximum limit for the number of transitions leaving
3014 while (p1->position >= 0)
3017 while (p2->position >= 0)
3019 counts[p1->position]++;
3027 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3028 labelled with one character range (there are no transitions on empty
3029 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3031 static reg_errcode_t
3032 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3033 int *counts, int *offs)
3036 tre_catenation_t *cat;
3037 tre_iteration_t *iter;
3038 reg_errcode_t errcode = REG_OK;
3040 /* XXX - recurse using a stack!. */
3046 uni = (tre_union_t *)node->obj;
3047 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3048 if (errcode != REG_OK)
3050 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3054 cat = (tre_catenation_t *)node->obj;
3055 /* Add a transition from each position in cat->left->lastpos
3056 to each position in cat->right->firstpos. */
3057 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3058 transitions, counts, offs);
3059 if (errcode != REG_OK)
3061 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3062 if (errcode != REG_OK)
3064 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3068 iter = (tre_iteration_t *)node->obj;
3069 assert(iter->max == -1 || iter->max == 1);
3071 if (iter->max == -1)
3073 assert(iter->min == 0 || iter->min == 1);
3074 /* Add a transition from each last position in the iterated
3075 expression to each first position. */
3076 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3077 transitions, counts, offs);
3078 if (errcode != REG_OK)
3081 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3088 #define ERROR_EXIT(err) \
3092 if (/*CONSTCOND*/1) \
3095 while (/*CONSTCOND*/0)
3099 regcomp(regex_t *preg, const char *regex, int cflags)
3102 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3103 tre_pos_and_tags_t *p;
3104 int *counts = NULL, *offs = NULL;
3106 tre_tnfa_transition_t *transitions, *initial;
3107 tre_tnfa_t *tnfa = NULL;
3108 tre_submatch_data_t *submatch_data;
3109 tre_tag_direction_t *tag_directions = NULL;
3110 reg_errcode_t errcode;
3113 /* Parse context. */
3114 tre_parse_ctx_t parse_ctx;
3116 /* Allocate a stack used throughout the compilation process for various
3118 stack = tre_stack_new(512, 10240, 128);
3121 /* Allocate a fast memory allocator. */
3122 mem = tre_mem_new();
3125 tre_stack_destroy(stack);
3129 /* Parse the regexp. */
3130 memset(&parse_ctx, 0, sizeof(parse_ctx));
3131 parse_ctx.mem = mem;
3132 parse_ctx.stack = stack;
3133 parse_ctx.re = regex;
3134 parse_ctx.cflags = cflags;
3135 parse_ctx.max_backref = -1;
3136 errcode = tre_parse(&parse_ctx);
3137 if (errcode != REG_OK)
3138 ERROR_EXIT(errcode);
3139 preg->re_nsub = parse_ctx.submatch_id - 1;
3140 tree = parse_ctx.result;
3142 /* Back references and approximate matching cannot currently be used
3143 in the same regexp. */
3144 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3145 ERROR_EXIT(REG_BADPAT);
3148 tre_ast_print(tree);
3149 #endif /* TRE_DEBUG */
3151 /* Referring to nonexistent subexpressions is illegal. */
3152 if (parse_ctx.max_backref > (int)preg->re_nsub)
3153 ERROR_EXIT(REG_ESUBREG);
3155 /* Allocate the TNFA struct. */
3156 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3158 ERROR_EXIT(REG_ESPACE);
3159 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3160 tnfa->have_approx = parse_ctx.have_approx;
3161 tnfa->num_submatches = parse_ctx.submatch_id;
3163 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3164 regexp does not have back references, this can be skipped. */
3165 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3168 /* Figure out how many tags we will need. */
3169 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3170 if (errcode != REG_OK)
3171 ERROR_EXIT(errcode);
3173 if (tnfa->num_tags > 0)
3175 tag_directions = xmalloc(sizeof(*tag_directions)
3176 * (tnfa->num_tags + 1));
3177 if (tag_directions == NULL)
3178 ERROR_EXIT(REG_ESPACE);
3179 tnfa->tag_directions = tag_directions;
3180 memset(tag_directions, -1,
3181 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3183 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3184 sizeof(tnfa->minimal_tags));
3185 if (tnfa->minimal_tags == NULL)
3186 ERROR_EXIT(REG_ESPACE);
3188 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3189 sizeof(*submatch_data));
3190 if (submatch_data == NULL)
3191 ERROR_EXIT(REG_ESPACE);
3192 tnfa->submatch_data = submatch_data;
3194 errcode = tre_add_tags(mem, stack, tree, tnfa);
3195 if (errcode != REG_OK)
3196 ERROR_EXIT(errcode);
3200 /* Expand iteration nodes. */
3201 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3202 tag_directions, &tnfa->params_depth);
3203 if (errcode != REG_OK)
3204 ERROR_EXIT(errcode);
3206 /* Add a dummy node for the final state.
3207 XXX - For certain patterns this dummy node can be optimized away,
3208 for example "a*" or "ab*". Figure out a simple way to detect
3209 this possibility. */
3211 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3212 if (tmp_ast_r == NULL)
3213 ERROR_EXIT(REG_ESPACE);
3215 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3217 ERROR_EXIT(REG_ESPACE);
3219 errcode = tre_compute_nfl(mem, stack, tree);
3220 if (errcode != REG_OK)
3221 ERROR_EXIT(errcode);
3223 counts = xmalloc(sizeof(int) * parse_ctx.position);
3225 ERROR_EXIT(REG_ESPACE);
3227 offs = xmalloc(sizeof(int) * parse_ctx.position);
3229 ERROR_EXIT(REG_ESPACE);
3231 for (i = 0; i < parse_ctx.position; i++)
3233 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3236 for (i = 0; i < parse_ctx.position; i++)
3239 add += counts[i] + 1;
3242 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3243 if (transitions == NULL)
3244 ERROR_EXIT(REG_ESPACE);
3245 tnfa->transitions = transitions;
3246 tnfa->num_transitions = add;
3248 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3249 if (errcode != REG_OK)
3250 ERROR_EXIT(errcode);
3252 tnfa->firstpos_chars = NULL;
3256 while (p->position >= 0)
3262 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3263 if (initial == NULL)
3264 ERROR_EXIT(REG_ESPACE);
3265 tnfa->initial = initial;
3268 for (p = tree->firstpos; p->position >= 0; p++)
3270 initial[i].state = transitions + offs[p->position];
3271 initial[i].state_id = p->position;
3272 initial[i].tags = NULL;
3273 /* Copy the arrays p->tags, and p->params, they are allocated
3274 from a tre_mem object. */
3278 for (j = 0; p->tags[j] >= 0; j++);
3279 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3280 if (!initial[i].tags)
3281 ERROR_EXIT(REG_ESPACE);
3282 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3284 initial[i].assertions = p->assertions;
3287 initial[i].state = NULL;
3289 tnfa->num_transitions = add;
3290 tnfa->final = transitions + offs[tree->lastpos[0].position];
3291 tnfa->num_states = parse_ctx.position;
3292 tnfa->cflags = cflags;
3294 tre_mem_destroy(mem);
3295 tre_stack_destroy(stack);
3299 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3303 /* Free everything that was allocated and return the error code. */
3304 tre_mem_destroy(mem);
3306 tre_stack_destroy(stack);
3311 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3320 regfree(regex_t *preg)
3324 tre_tnfa_transition_t *trans;
3326 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3330 for (i = 0; i < tnfa->num_transitions; i++)
3331 if (tnfa->transitions[i].state)
3333 if (tnfa->transitions[i].tags)
3334 xfree(tnfa->transitions[i].tags);
3335 if (tnfa->transitions[i].neg_classes)
3336 xfree(tnfa->transitions[i].neg_classes);
3338 if (tnfa->transitions)
3339 xfree(tnfa->transitions);
3343 for (trans = tnfa->initial; trans->state; trans++)
3348 xfree(tnfa->initial);
3351 if (tnfa->submatch_data)
3353 for (i = 0; i < tnfa->num_submatches; i++)
3354 if (tnfa->submatch_data[i].parents)
3355 xfree(tnfa->submatch_data[i].parents);
3356 xfree(tnfa->submatch_data);
3359 if (tnfa->tag_directions)
3360 xfree(tnfa->tag_directions);
3361 if (tnfa->firstpos_chars)
3362 xfree(tnfa->firstpos_chars);
3363 if (tnfa->minimal_tags)
3364 xfree(tnfa->minimal_tags);