2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 /***********************************************************************
45 ***********************************************************************/
54 tre_ctype_t *neg_classes;
60 /***********************************************************************
61 from tre-ast.c and tre-ast.h
62 ***********************************************************************/
64 /* The different AST node types. */
72 /* Special subtypes of TRE_LITERAL. */
73 #define EMPTY -1 /* Empty leaf (denotes empty string). */
74 #define ASSERTION -2 /* Assertion leaf. */
75 #define TAG -3 /* Tag leaf. */
76 #define BACKREF -4 /* Back reference leaf. */
78 #define IS_SPECIAL(x) ((x)->code_min < 0)
79 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
80 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
81 #define IS_TAG(x) ((x)->code_min == TAG)
82 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
85 /* A generic AST node. All AST nodes consist of this node on the top
86 level with `obj' pointing to the actual content. */
88 tre_ast_type_t type; /* Type of the node. */
89 void *obj; /* Pointer to actual node. */
94 tre_pos_and_tags_t *firstpos;
95 tre_pos_and_tags_t *lastpos;
99 /* A "literal" node. These are created for assertions, back references,
100 tags, matching parameter settings, and all expressions that match one
110 tre_ctype_t *neg_classes;
113 /* A "catenation" node. These are created when two regexps are concatenated.
114 If there are more than one subexpressions in sequence, the `left' part
115 holds all but the last, and `right' part holds the last subexpression
116 (catenation is left associative). */
118 tre_ast_node_t *left;
119 tre_ast_node_t *right;
122 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
125 /* Subexpression to match. */
127 /* Minimum number of consecutive matches. */
129 /* Maximum number of consecutive matches. */
131 /* If 0, match as many characters as possible, if 1 match as few as
132 possible. Note that this does not always mean the same thing as
133 matching as many/few repetitions as possible. */
134 unsigned int minimal:1;
137 /* An "union" node. These are created for the "|" operator. */
139 tre_ast_node_t *left;
140 tre_ast_node_t *right;
143 static tre_ast_node_t *
144 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
146 static tre_ast_node_t *
147 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
149 static tre_ast_node_t *
150 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
153 static tre_ast_node_t *
154 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
156 static tre_ast_node_t *
157 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
158 tre_ast_node_t *right);
161 static tre_ast_node_t *
162 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
164 tre_ast_node_t *node;
166 node = tre_mem_calloc(mem, sizeof(*node));
169 node->obj = tre_mem_calloc(mem, size);
174 node->submatch_id = -1;
179 static tre_ast_node_t *
180 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
182 tre_ast_node_t *node;
185 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
189 lit->code_min = code_min;
190 lit->code_max = code_max;
191 lit->position = position;
196 static tre_ast_node_t *
197 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
200 tre_ast_node_t *node;
201 tre_iteration_t *iter;
203 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
210 iter->minimal = minimal;
211 node->num_submatches = arg->num_submatches;
216 static tre_ast_node_t *
217 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
219 tre_ast_node_t *node;
221 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
224 ((tre_union_t *)node->obj)->left = left;
225 ((tre_union_t *)node->obj)->right = right;
226 node->num_submatches = left->num_submatches + right->num_submatches;
231 static tre_ast_node_t *
232 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
233 tre_ast_node_t *right)
235 tre_ast_node_t *node;
237 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
240 ((tre_catenation_t *)node->obj)->left = left;
241 ((tre_catenation_t *)node->obj)->right = right;
242 node->num_submatches = left->num_submatches + right->num_submatches;
248 /***********************************************************************
249 from tre-stack.c and tre-stack.h
250 ***********************************************************************/
252 typedef struct tre_stack_rec tre_stack_t;
254 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
255 is maximum size, and `increment' specifies how much more space will be
256 allocated with realloc() if all space gets used up. Returns the stack
257 object or NULL if out of memory. */
259 tre_stack_new(int size, int max_size, int increment);
261 /* Frees the stack object. */
263 tre_stack_destroy(tre_stack_t *s);
265 /* Returns the current number of objects in the stack. */
267 tre_stack_num_objects(tre_stack_t *s);
269 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
270 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
271 This tries to realloc() more space before failing if maximum size
272 has not yet been reached. Returns REG_OK if successful. */
273 #define declare_pushf(typetag, type) \
274 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
276 declare_pushf(voidptr, void *);
277 declare_pushf(int, int);
279 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
280 element off of stack `s' and returns it. The stack must not be
282 #define declare_popf(typetag, type) \
283 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
285 declare_popf(voidptr, void *);
286 declare_popf(int, int);
288 /* Just to save some typing. */
289 #define STACK_PUSH(s, typetag, value) \
292 status = tre_stack_push_ ## typetag(s, value); \
294 while (/*CONSTCOND*/0)
296 #define STACK_PUSHX(s, typetag, value) \
298 status = tre_stack_push_ ## typetag(s, value); \
299 if (status != REG_OK) \
303 #define STACK_PUSHR(s, typetag, value) \
305 reg_errcode_t _status; \
306 _status = tre_stack_push_ ## typetag(s, value); \
307 if (_status != REG_OK) \
311 union tre_stack_item {
316 struct tre_stack_rec {
321 union tre_stack_item *stack;
326 tre_stack_new(int size, int max_size, int increment)
330 s = xmalloc(sizeof(*s));
333 s->stack = xmalloc(sizeof(*s->stack) * size);
334 if (s->stack == NULL)
340 s->max_size = max_size;
341 s->increment = increment;
348 tre_stack_destroy(tre_stack_t *s)
355 tre_stack_num_objects(tre_stack_t *s)
361 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
363 if (s->ptr < s->size)
365 s->stack[s->ptr] = value;
370 if (s->size >= s->max_size)
376 union tre_stack_item *new_buffer;
378 new_size = s->size + s->increment;
379 if (new_size > s->max_size)
380 new_size = s->max_size;
381 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
382 if (new_buffer == NULL)
386 assert(new_size > s->size);
388 s->stack = new_buffer;
389 tre_stack_push(s, value);
395 #define define_pushf(typetag, type) \
396 declare_pushf(typetag, type) { \
397 union tre_stack_item item; \
398 item.typetag ## _value = value; \
399 return tre_stack_push(s, item); \
402 define_pushf(int, int)
403 define_pushf(voidptr, void *)
405 #define define_popf(typetag, type) \
406 declare_popf(typetag, type) { \
407 return s->stack[--s->ptr].typetag ## _value; \
410 define_popf(int, int)
411 define_popf(voidptr, void *)
414 /***********************************************************************
415 from tre-parse.c and tre-parse.h
416 ***********************************************************************/
420 /* Memory allocator. The AST is allocated using this. */
422 /* Stack used for keeping track of regexp syntax. */
424 /* The parse result. */
425 tre_ast_node_t *result;
426 /* The regexp to parse and its length. */
428 /* The first character of the entire regexp. */
429 const char *re_start;
430 /* Current submatch ID. */
432 /* Current position (number of literal). */
434 /* The highest back reference or -1 if none seen so far. */
436 /* This flag is set if the regexp uses approximate matching. */
438 /* Compilation flags. */
440 /* If this flag is set the top-level submatch is not captured. */
444 /* Parses a wide character regexp pattern into a syntax tree. This parser
445 handles both syntaxes (BRE and ERE), including the TRE extensions. */
447 tre_parse(tre_parse_ctx_t *ctx);
451 This parser is just a simple recursive descent parser for POSIX.2
452 regexps. The parser supports both the obsolete default syntax and
453 the "extended" syntax, and some nonstandard extensions.
456 /* Characters with special meanings in regexp syntax. */
457 #define CHAR_PIPE '|'
458 #define CHAR_LPAREN '('
459 #define CHAR_RPAREN ')'
460 #define CHAR_LBRACE '{'
461 #define CHAR_RBRACE '}'
462 #define CHAR_LBRACKET '['
463 #define CHAR_RBRACKET ']'
464 #define CHAR_MINUS '-'
465 #define CHAR_STAR '*'
466 #define CHAR_QUESTIONMARK '?'
467 #define CHAR_PLUS '+'
468 #define CHAR_PERIOD '.'
469 #define CHAR_COLON ':'
470 #define CHAR_EQUAL '='
471 #define CHAR_COMMA ','
472 #define CHAR_CARET '^'
473 #define CHAR_DOLLAR '$'
474 #define CHAR_BACKSLASH '\\'
475 #define CHAR_HASH '#'
476 #define CHAR_TILDE '~'
479 /* Some macros for expanding \w, \s, etc. */
480 static const struct tre_macro_struct {
482 const char *expansion;
484 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
485 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
486 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
487 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
492 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
493 must have at least `len' items. Sets buf[0] to zero if the there
494 is no match in `tre_macros'. */
496 tre_expand_macro(const char *regex)
503 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
504 return tre_macros[i].expansion;
508 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
509 tre_ast_node_t ***items)
511 reg_errcode_t status;
512 tre_ast_node_t **array = *items;
513 /* Allocate more space if necessary. */
516 tre_ast_node_t **new_items;
517 /* If the array is already 1024 items large, give up -- there's
518 probably an error in the regexp (e.g. not a '\0' terminated
519 string and missing ']') */
523 new_items = xrealloc(array, sizeof(*items) * *max_i);
524 if (new_items == NULL)
526 *items = array = new_items;
528 array[*i] = tre_ast_new_literal(mem, min, max, -1);
529 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
536 tre_compare_items(const void *a, const void *b)
538 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
539 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
540 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
541 int a_min = l_a->code_min, b_min = l_b->code_min;
545 else if (a_min > b_min)
551 /* Maximum number of character classes that can occur in a negated bracket
553 #define MAX_NEG_CLASSES 64
555 /* Maximum length of character class names. */
556 #define MAX_CLASS_NAME
559 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
560 tre_ctype_t neg_classes[], int *num_neg_classes,
561 tre_ast_node_t ***items, int *num_items,
564 const char *re = ctx->re;
565 reg_errcode_t status = REG_OK;
566 tre_ctype_t class = (tre_ctype_t)0;
568 int max_i = *items_size;
571 /* Build an array of the items in the bracket expression. */
572 while (status == REG_OK)
579 else if (*re == CHAR_RBRACKET && re > ctx->re)
586 tre_cint_t min = 0, max = 0;
588 int clen = mbtowc(&wc, re, -1);
590 if (clen<0) clen=1, wc=WEOF;
592 class = (tre_ctype_t)0;
593 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
597 clen = mbtowc(&wc, re, -1);
598 if (clen<0) clen=1, wc=WEOF;
601 /* XXX - Should use collation order instead of encoding values
602 in character ranges. */
606 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
607 status = REG_ECOLLATE;
608 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
609 status = REG_ECOLLATE;
610 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
613 const char *endptr = re + 2;
615 while (*endptr && *endptr != CHAR_COLON)
619 len = MIN(endptr - re - 2, 63);
620 strncpy(tmp_str, re + 2, len);
622 class = tre_ctype(tmp_str);
634 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
636 /* Two ranges are not allowed to share and endpoint. */
642 if (status != REG_OK)
646 if (*num_neg_classes >= MAX_NEG_CLASSES)
649 neg_classes[(*num_neg_classes)++] = class;
652 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
653 if (status != REG_OK)
655 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
658 /* Add opposite-case counterpoints if REG_ICASE is present.
659 This is broken if there are more than two "same" characters. */
660 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
662 tre_cint_t cmin, ccurr;
666 if (tre_islower(min))
668 cmin = ccurr = tre_toupper(min++);
669 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
671 ccurr = tre_toupper(min++);
672 status = tre_new_item(ctx->mem, cmin, ccurr,
675 else if (tre_isupper(min))
677 cmin = ccurr = tre_tolower(min++);
678 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
680 ccurr = tre_tolower(min++);
681 status = tre_new_item(ctx->mem, cmin, ccurr,
685 if (status != REG_OK)
688 if (status != REG_OK)
700 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
702 tre_ast_node_t *node = NULL;
704 reg_errcode_t status = REG_OK;
705 tre_ast_node_t **items, *u, *n;
706 int i = 0, j, max_i = 32, curr_max, curr_min;
707 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
708 int num_neg_classes = 0;
710 /* Start off with an array of `max_i' elements. */
711 items = xmalloc(sizeof(*items) * max_i);
715 if (*ctx->re == CHAR_CARET)
721 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
724 if (status != REG_OK)
725 goto parse_bracket_done;
727 /* Sort the array if we need to negate it. */
729 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
731 curr_max = curr_min = 0;
732 /* Build a union of the items in the array, negated if necessary. */
733 for (j = 0; j < i && status == REG_OK; j++)
736 tre_literal_t *l = items[j]->obj;
745 curr_max = MAX(max + 1, curr_max);
752 if (curr_max >= curr_min)
754 l->code_min = curr_min;
755 l->code_max = curr_max;
761 curr_min = curr_max = max + 1;
768 l->position = ctx->position;
769 if (num_neg_classes > 0)
771 l->neg_classes = tre_mem_alloc(ctx->mem,
772 (sizeof(l->neg_classes)
773 * (num_neg_classes + 1)));
774 if (l->neg_classes == NULL)
779 for (k = 0; k < num_neg_classes; k++)
780 l->neg_classes[k] = neg_classes[k];
781 l->neg_classes[k] = (tre_ctype_t)0;
784 l->neg_classes = NULL;
789 u = tre_ast_new_union(ctx->mem, node, items[j]);
797 if (status != REG_OK)
798 goto parse_bracket_done;
803 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
808 tre_literal_t *l = n->obj;
809 if (num_neg_classes > 0)
811 l->neg_classes = tre_mem_alloc(ctx->mem,
812 (sizeof(l->neg_classes)
813 * (num_neg_classes + 1)));
814 if (l->neg_classes == NULL)
817 goto parse_bracket_done;
819 for (k = 0; k < num_neg_classes; k++)
820 l->neg_classes[k] = neg_classes[k];
821 l->neg_classes[k] = (tre_ctype_t)0;
824 l->neg_classes = NULL;
829 u = tre_ast_new_union(ctx->mem, node, n);
837 if (status != REG_OK)
838 goto parse_bracket_done;
842 #endif /* TRE_DEBUG */
852 /* Parses a positive decimal integer. Returns -1 if the string does not
853 contain a valid number. */
855 tre_parse_int(const char **regex)
858 const char *r = *regex;
863 num = num * 10 + *r - '0';
872 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
875 const char *r = ctx->re;
878 /* Parse number (minimum repetition count). */
880 if (*r >= '0' && *r <= '9') {
881 min = tre_parse_int(&r);
884 /* Parse comma and second number (maximum repetition count). */
886 if (*r == CHAR_COMMA)
889 max = tre_parse_int(&r);
892 /* Check that the repeat counts are sane. */
893 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
900 /* Empty contents of {}. */
904 /* Parse the ending '}' or '\}'.*/
905 if (ctx->cflags & REG_EXTENDED)
907 if (*r != CHAR_RBRACE)
913 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
918 /* Create the AST node(s). */
919 if (min == 0 && max == 0)
921 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
927 if (min < 0 && max < 0)
928 /* Only approximate parameters set, no repetitions. */
931 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
943 PARSE_MARK_FOR_SUBMATCH,
947 PARSE_POST_CATENATION,
952 } tre_parse_re_stack_symbol_t;
956 tre_parse(tre_parse_ctx_t *ctx)
958 tre_ast_node_t *result = NULL;
959 tre_parse_re_stack_symbol_t symbol;
960 reg_errcode_t status = REG_OK;
961 tre_stack_t *stack = ctx->stack;
962 int bottom = tre_stack_num_objects(stack);
967 if (!ctx->nofirstsub)
969 STACK_PUSH(stack, int, ctx->submatch_id);
970 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
973 STACK_PUSH(stack, int, PARSE_RE);
974 ctx->re_start = ctx->re;
977 /* The following is basically just a recursive descent parser. I use
978 an explicit stack instead of recursive functions mostly because of
979 two reasons: compatibility with systems which have an overflowable
980 call stack, and efficiency (both in lines of code and speed). */
981 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
983 if (status != REG_OK)
985 symbol = tre_stack_pop_int(stack);
989 /* Parse a full regexp. A regexp is one or more branches,
990 separated by the union operator `|'. */
991 if (ctx->cflags & REG_EXTENDED)
992 STACK_PUSHX(stack, int, PARSE_UNION);
993 STACK_PUSHX(stack, int, PARSE_BRANCH);
997 /* Parse a branch. A branch is one or more pieces, concatenated.
998 A piece is an atom possibly followed by a postfix operator. */
999 STACK_PUSHX(stack, int, PARSE_CATENATION);
1000 STACK_PUSHX(stack, int, PARSE_PIECE);
1004 /* Parse a piece. A piece is an atom possibly followed by one
1005 or more postfix operators. */
1006 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1007 STACK_PUSHX(stack, int, PARSE_ATOM);
1010 case PARSE_CATENATION:
1011 /* If the expression has not ended, parse another piece. */
1017 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1019 if ((ctx->cflags & REG_EXTENDED
1020 && c == CHAR_RPAREN && depth > 0)
1021 || (!(ctx->cflags & REG_EXTENDED)
1022 && (c == CHAR_BACKSLASH
1023 && *(ctx->re + 1) == CHAR_RPAREN)))
1025 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1026 status = REG_EPAREN;
1028 if (!(ctx->cflags & REG_EXTENDED))
1034 /* Default case, left associative concatenation. */
1035 STACK_PUSHX(stack, int, PARSE_CATENATION);
1036 STACK_PUSHX(stack, voidptr, result);
1037 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1038 STACK_PUSHX(stack, int, PARSE_PIECE);
1043 case PARSE_POST_CATENATION:
1045 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1046 tre_ast_node_t *tmp_node;
1047 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1060 STACK_PUSHX(stack, int, PARSE_UNION);
1061 STACK_PUSHX(stack, voidptr, result);
1062 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1063 STACK_PUSHX(stack, int, PARSE_BRANCH);
1076 case PARSE_POST_UNION:
1078 tre_ast_node_t *tmp_node;
1079 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1080 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1088 /* Parse postfix operators. */
1094 case CHAR_QUESTIONMARK:
1095 if (!(ctx->cflags & REG_EXTENDED))
1100 tre_ast_node_t *tmp_node;
1105 if (*ctx->re == CHAR_PLUS)
1107 if (*ctx->re == CHAR_QUESTIONMARK)
1111 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1113 if (tmp_node == NULL)
1116 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1120 case CHAR_BACKSLASH:
1121 /* "\{" is special without REG_EXTENDED */
1122 if (!(ctx->cflags & REG_EXTENDED)
1123 && *(ctx->re + 1) == CHAR_LBRACE)
1132 /* "{" is literal without REG_EXTENDED */
1133 if (!(ctx->cflags & REG_EXTENDED))
1139 status = tre_parse_bound(ctx, &result);
1140 if (status != REG_OK)
1142 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1148 /* Parse an atom. An atom is a regular expression enclosed in `()',
1149 an empty set of `()', a bracket expression, `.', `^', `$',
1150 a `\' followed by a character, or a single character. */
1154 case CHAR_LPAREN: /* parenthesized subexpression */
1156 if (ctx->cflags & REG_EXTENDED)
1162 /* First parse a whole RE, then mark the resulting tree
1164 STACK_PUSHX(stack, int, ctx->submatch_id);
1165 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1166 STACK_PUSHX(stack, int, PARSE_RE);
1174 case CHAR_LBRACKET: /* bracket expression */
1176 status = tre_parse_bracket(ctx, &result);
1177 if (status != REG_OK)
1181 case CHAR_BACKSLASH:
1182 /* If this is "\(" or "\)" chew off the backslash and
1184 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1189 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1194 /* If a macro is used, parse the expanded macro recursively. */
1196 const char *buf = tre_expand_macro(ctx->re + 1);
1199 tre_parse_ctx_t subctx;
1200 memcpy(&subctx, ctx, sizeof(subctx));
1202 subctx.nofirstsub = 1;
1203 status = tre_parse(&subctx);
1204 if (status != REG_OK)
1207 ctx->position = subctx.position;
1208 result = subctx.result;
1214 /* Trailing backslash. */
1221 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1226 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1227 ASSERT_AT_WB_NEG, -1);
1231 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1236 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1242 if (ctx->re[0] != CHAR_LBRACE)
1244 /* 8 bit hex char. */
1245 char tmp[3] = {0, 0, 0};
1248 if (tre_isxdigit(ctx->re[0]))
1250 tmp[0] = (char)ctx->re[0];
1253 if (tre_isxdigit(ctx->re[0]))
1255 tmp[1] = (char)ctx->re[0];
1258 val = strtol(tmp, NULL, 16);
1259 result = tre_ast_new_literal(ctx->mem, (int)val,
1260 (int)val, ctx->position);
1271 while (*ctx->re && i < sizeof tmp)
1273 if (ctx->re[0] == CHAR_RBRACE)
1275 if (tre_isxdigit(ctx->re[0]))
1277 tmp[i] = (char)ctx->re[0];
1286 val = strtol(tmp, NULL, 16);
1287 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1295 if (tre_isdigit(*ctx->re))
1297 /* Back reference. */
1298 int val = *ctx->re - '0';
1299 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1304 ctx->max_backref = MAX(val, ctx->max_backref);
1309 /* Escaped character. */
1310 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1321 case CHAR_PERIOD: /* the any-symbol */
1322 if (ctx->cflags & REG_NEWLINE)
1324 tre_ast_node_t *tmp1;
1325 tre_ast_node_t *tmp2;
1326 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1330 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1334 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1341 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1350 case CHAR_CARET: /* beginning of line assertion */
1351 /* '^' has a special meaning everywhere in EREs, and at
1352 beginning of BRE. */
1353 if (ctx->cflags & REG_EXTENDED
1354 || ctx->re == ctx->re_start)
1356 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1366 case CHAR_DOLLAR: /* end of line assertion. */
1367 /* '$' is special everywhere in EREs, and in the end of the
1369 if (ctx->cflags & REG_EXTENDED
1372 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1389 case CHAR_QUESTIONMARK:
1390 if (!(ctx->cflags & REG_EXTENDED))
1395 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1403 clen = mbtowc(&wc, ctx->re, -1);
1404 if (clen<0) clen=1, wc=WEOF;
1406 /* Note that we can't use an tre_isalpha() test here, since there
1407 may be characters which are alphabetic but neither upper or
1409 if (ctx->cflags & REG_ICASE
1410 && (tre_isupper(wc) || tre_islower(wc)))
1412 tre_ast_node_t *tmp1;
1413 tre_ast_node_t *tmp2;
1415 /* XXX - Can there be more than one opposite-case
1416 counterpoints for some character in some locale? Or
1417 more than two characters which all should be regarded
1418 the same character if case is ignored? If yes, there
1419 does not seem to be a portable way to detect it. I guess
1420 that at least for multi-character collating elements there
1421 could be several opposite-case counterpoints, but they
1422 cannot be supported portably anyway. */
1423 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1428 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1433 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1439 result = tre_ast_new_literal(ctx->mem, wc, wc,
1450 case PARSE_MARK_FOR_SUBMATCH:
1452 int submatch_id = tre_stack_pop_int(stack);
1454 if (result->submatch_id >= 0)
1456 tre_ast_node_t *n, *tmp_node;
1457 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1460 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1461 if (tmp_node == NULL)
1463 tmp_node->num_submatches = result->num_submatches;
1466 result->submatch_id = submatch_id;
1467 result->num_submatches++;
1471 case PARSE_RESTORE_CFLAGS:
1472 ctx->cflags = tre_stack_pop_int(stack);
1481 /* Check for missing closing parentheses. */
1485 if (status == REG_OK)
1486 ctx->result = result;
1493 /***********************************************************************
1495 ***********************************************************************/
1500 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1505 Algorithms to setup tags so that submatch addressing can be done.
1509 /* Inserts a catenation node to the root of the tree given in `node'.
1510 As the left child a new tag with number `tag_id' to `node' is added,
1511 and the right child is the old root. */
1512 static reg_errcode_t
1513 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1515 tre_catenation_t *c;
1517 c = tre_mem_alloc(mem, sizeof(*c));
1520 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1521 if (c->left == NULL)
1523 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1524 if (c->right == NULL)
1527 c->right->obj = node->obj;
1528 c->right->type = node->type;
1529 c->right->nullable = -1;
1530 c->right->submatch_id = -1;
1531 c->right->firstpos = NULL;
1532 c->right->lastpos = NULL;
1533 c->right->num_tags = 0;
1535 node->type = CATENATION;
1539 /* Inserts a catenation node to the root of the tree given in `node'.
1540 As the right child a new tag with number `tag_id' to `node' is added,
1541 and the left child is the old root. */
1542 static reg_errcode_t
1543 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1545 tre_catenation_t *c;
1547 c = tre_mem_alloc(mem, sizeof(*c));
1550 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1551 if (c->right == NULL)
1553 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1554 if (c->left == NULL)
1557 c->left->obj = node->obj;
1558 c->left->type = node->type;
1559 c->left->nullable = -1;
1560 c->left->submatch_id = -1;
1561 c->left->firstpos = NULL;
1562 c->left->lastpos = NULL;
1563 c->left->num_tags = 0;
1565 node->type = CATENATION;
1571 ADDTAGS_AFTER_ITERATION,
1572 ADDTAGS_AFTER_UNION_LEFT,
1573 ADDTAGS_AFTER_UNION_RIGHT,
1574 ADDTAGS_AFTER_CAT_LEFT,
1575 ADDTAGS_AFTER_CAT_RIGHT,
1576 ADDTAGS_SET_SUBMATCH_END
1577 } tre_addtags_symbol_t;
1586 /* Go through `regset' and set submatch data for submatches that are
1589 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1593 for (i = 0; regset[i] >= 0; i++)
1595 int id = regset[i] / 2;
1596 int start = !(regset[i] % 2);
1598 tnfa->submatch_data[id].so_tag = tag;
1600 tnfa->submatch_data[id].eo_tag = tag;
1606 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1607 subexpressions marked for submatch addressing can be traced. */
1608 static reg_errcode_t
1609 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1612 reg_errcode_t status = REG_OK;
1613 tre_addtags_symbol_t symbol;
1614 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1615 int bottom = tre_stack_num_objects(stack);
1616 /* True for first pass (counting number of needed tags) */
1617 int first_pass = (mem == NULL || tnfa == NULL);
1618 int *regset, *orig_regset;
1619 int num_tags = 0; /* Total number of tags. */
1620 int num_minimals = 0; /* Number of special minimal tags. */
1621 int tag = 0; /* The tag that is to be added next. */
1622 int next_tag = 1; /* Next tag to use after this one. */
1623 int *parents; /* Stack of submatches the current submatch is
1625 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1626 tre_tag_states_t *saved_states;
1628 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1632 tnfa->minimal_tags[0] = -1;
1635 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1639 orig_regset = regset;
1641 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1642 if (parents == NULL)
1649 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1650 if (saved_states == NULL)
1659 for (i = 0; i <= tnfa->num_submatches; i++)
1660 saved_states[i].tag = -1;
1663 STACK_PUSH(stack, voidptr, node);
1664 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1666 while (tre_stack_num_objects(stack) > bottom)
1668 if (status != REG_OK)
1671 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1675 case ADDTAGS_SET_SUBMATCH_END:
1677 int id = tre_stack_pop_int(stack);
1680 /* Add end of this submatch to regset. */
1681 for (i = 0; regset[i] >= 0; i++);
1682 regset[i] = id * 2 + 1;
1685 /* Pop this submatch from the parents stack. */
1686 for (i = 0; parents[i] >= 0; i++);
1687 parents[i - 1] = -1;
1691 case ADDTAGS_RECURSE:
1692 node = tre_stack_pop_voidptr(stack);
1694 if (node->submatch_id >= 0)
1696 int id = node->submatch_id;
1700 /* Add start of this submatch to regset. */
1701 for (i = 0; regset[i] >= 0; i++);
1707 for (i = 0; parents[i] >= 0; i++);
1708 tnfa->submatch_data[id].parents = NULL;
1711 int *p = xmalloc(sizeof(*p) * (i + 1));
1714 status = REG_ESPACE;
1717 assert(tnfa->submatch_data[id].parents == NULL);
1718 tnfa->submatch_data[id].parents = p;
1719 for (i = 0; parents[i] >= 0; i++)
1725 /* Add end of this submatch to regset after processing this
1727 STACK_PUSHX(stack, int, node->submatch_id);
1728 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1735 tre_literal_t *lit = node->obj;
1737 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1742 /* Regset is not empty, so add a tag before the
1743 literal or backref. */
1746 status = tre_add_tag_left(mem, node, tag);
1747 tnfa->tag_directions[tag] = direction;
1748 if (minimal_tag >= 0)
1750 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1751 tnfa->minimal_tags[i] = tag;
1752 tnfa->minimal_tags[i + 1] = minimal_tag;
1753 tnfa->minimal_tags[i + 2] = -1;
1757 tre_purge_regset(regset, tnfa, tag);
1772 assert(!IS_TAG(lit));
1778 tre_catenation_t *cat = node->obj;
1779 tre_ast_node_t *left = cat->left;
1780 tre_ast_node_t *right = cat->right;
1781 int reserved_tag = -1;
1784 /* After processing right child. */
1785 STACK_PUSHX(stack, voidptr, node);
1786 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1788 /* Process right child. */
1789 STACK_PUSHX(stack, voidptr, right);
1790 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1792 /* After processing left child. */
1793 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1794 if (left->num_tags > 0 && right->num_tags > 0)
1796 /* Reserve the next tag to the right child. */
1797 reserved_tag = next_tag;
1800 STACK_PUSHX(stack, int, reserved_tag);
1801 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1803 /* Process left child. */
1804 STACK_PUSHX(stack, voidptr, left);
1805 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1811 tre_iteration_t *iter = node->obj;
1815 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1819 STACK_PUSHX(stack, int, tag);
1820 STACK_PUSHX(stack, int, iter->minimal);
1822 STACK_PUSHX(stack, voidptr, node);
1823 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1825 STACK_PUSHX(stack, voidptr, iter->arg);
1826 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1828 /* Regset is not empty, so add a tag here. */
1829 if (regset[0] >= 0 || iter->minimal)
1834 status = tre_add_tag_left(mem, node, tag);
1836 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1838 tnfa->tag_directions[tag] = direction;
1839 if (minimal_tag >= 0)
1841 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1842 tnfa->minimal_tags[i] = tag;
1843 tnfa->minimal_tags[i + 1] = minimal_tag;
1844 tnfa->minimal_tags[i + 2] = -1;
1848 tre_purge_regset(regset, tnfa, tag);
1856 direction = TRE_TAG_MINIMIZE;
1861 tre_union_t *uni = node->obj;
1862 tre_ast_node_t *left = uni->left;
1863 tre_ast_node_t *right = uni->right;
1869 left_tag = next_tag;
1870 right_tag = next_tag + 1;
1875 right_tag = next_tag;
1878 /* After processing right child. */
1879 STACK_PUSHX(stack, int, right_tag);
1880 STACK_PUSHX(stack, int, left_tag);
1881 STACK_PUSHX(stack, voidptr, regset);
1882 STACK_PUSHX(stack, int, regset[0] >= 0);
1883 STACK_PUSHX(stack, voidptr, node);
1884 STACK_PUSHX(stack, voidptr, right);
1885 STACK_PUSHX(stack, voidptr, left);
1886 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1888 /* Process right child. */
1889 STACK_PUSHX(stack, voidptr, right);
1890 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1892 /* After processing left child. */
1893 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1895 /* Process left child. */
1896 STACK_PUSHX(stack, voidptr, left);
1897 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1899 /* Regset is not empty, so add a tag here. */
1905 status = tre_add_tag_left(mem, node, tag);
1906 tnfa->tag_directions[tag] = direction;
1907 if (minimal_tag >= 0)
1909 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1910 tnfa->minimal_tags[i] = tag;
1911 tnfa->minimal_tags[i + 1] = minimal_tag;
1912 tnfa->minimal_tags[i + 2] = -1;
1916 tre_purge_regset(regset, tnfa, tag);
1925 if (node->num_submatches > 0)
1927 /* The next two tags are reserved for markers. */
1937 if (node->submatch_id >= 0)
1940 /* Push this submatch on the parents stack. */
1941 for (i = 0; parents[i] >= 0; i++);
1942 parents[i] = node->submatch_id;
1943 parents[i + 1] = -1;
1946 break; /* end case: ADDTAGS_RECURSE */
1948 case ADDTAGS_AFTER_ITERATION:
1952 node = tre_stack_pop_voidptr(stack);
1955 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1956 + tre_stack_pop_int(stack);
1961 minimal = tre_stack_pop_int(stack);
1962 enter_tag = tre_stack_pop_int(stack);
1964 minimal_tag = enter_tag;
1970 direction = TRE_TAG_MINIMIZE;
1972 direction = TRE_TAG_MAXIMIZE;
1977 case ADDTAGS_AFTER_CAT_LEFT:
1979 int new_tag = tre_stack_pop_int(stack);
1980 next_tag = tre_stack_pop_int(stack);
1988 case ADDTAGS_AFTER_CAT_RIGHT:
1989 node = tre_stack_pop_voidptr(stack);
1991 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1992 + ((tre_catenation_t *)node->obj)->right->num_tags;
1995 case ADDTAGS_AFTER_UNION_LEFT:
1996 /* Lift the bottom of the `regset' array so that when processing
1997 the right operand the items currently in the array are
1998 invisible. The original bottom was saved at ADDTAGS_UNION and
1999 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
2000 while (*regset >= 0)
2004 case ADDTAGS_AFTER_UNION_RIGHT:
2006 int added_tags, tag_left, tag_right;
2007 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2008 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2009 node = tre_stack_pop_voidptr(stack);
2010 added_tags = tre_stack_pop_int(stack);
2013 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2014 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2015 + ((node->num_submatches > 0) ? 2 : 0);
2017 regset = tre_stack_pop_voidptr(stack);
2018 tag_left = tre_stack_pop_int(stack);
2019 tag_right = tre_stack_pop_int(stack);
2021 /* Add tags after both children, the left child gets a smaller
2022 tag than the right child. This guarantees that we prefer
2023 the left child over the right child. */
2024 /* XXX - This is not always necessary (if the children have
2025 tags which must be seen for every match of that child). */
2026 /* XXX - Check if this is the only place where tre_add_tag_right
2027 is used. If so, use tre_add_tag_left (putting the tag before
2028 the child as opposed after the child) and throw away
2029 tre_add_tag_right. */
2030 if (node->num_submatches > 0)
2034 status = tre_add_tag_right(mem, left, tag_left);
2035 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2036 status = tre_add_tag_right(mem, right, tag_right);
2037 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2041 direction = TRE_TAG_MAXIMIZE;
2049 } /* end switch(symbol) */
2050 } /* end while(tre_stack_num_objects(stack) > bottom) */
2053 tre_purge_regset(regset, tnfa, tag);
2055 if (!first_pass && minimal_tag >= 0)
2058 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2059 tnfa->minimal_tags[i] = tag;
2060 tnfa->minimal_tags[i + 1] = minimal_tag;
2061 tnfa->minimal_tags[i + 2] = -1;
2066 assert(tree->num_tags == num_tags);
2067 tnfa->end_tag = num_tags;
2068 tnfa->num_tags = num_tags;
2069 tnfa->num_minimals = num_minimals;
2072 xfree(saved_states);
2079 AST to TNFA compilation routines.
2085 } tre_copyast_symbol_t;
2087 /* Flags for tre_copy_ast(). */
2088 #define COPY_REMOVE_TAGS 1
2089 #define COPY_MAXIMIZE_FIRST_TAG 2
2091 static reg_errcode_t
2092 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2093 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2094 tre_ast_node_t **copy, int *max_pos)
2096 reg_errcode_t status = REG_OK;
2097 int bottom = tre_stack_num_objects(stack);
2100 tre_ast_node_t **result = copy;
2101 tre_copyast_symbol_t symbol;
2103 STACK_PUSH(stack, voidptr, ast);
2104 STACK_PUSH(stack, int, COPY_RECURSE);
2106 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2108 tre_ast_node_t *node;
2109 if (status != REG_OK)
2112 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2115 case COPY_SET_RESULT_PTR:
2116 result = tre_stack_pop_voidptr(stack);
2119 node = tre_stack_pop_voidptr(stack);
2124 tre_literal_t *lit = node->obj;
2125 int pos = lit->position;
2126 int min = lit->code_min;
2127 int max = lit->code_max;
2128 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2130 /* XXX - e.g. [ab] has only one position but two
2131 nodes, so we are creating holes in the state space
2132 here. Not fatal, just wastes memory. */
2136 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2138 /* Change this tag to empty. */
2142 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2145 /* Maximize the first tag. */
2146 tag_directions[max] = TRE_TAG_MAXIMIZE;
2149 *result = tre_ast_new_literal(mem, min, max, pos);
2150 if (*result == NULL)
2151 status = REG_ESPACE;
2159 tre_union_t *uni = node->obj;
2161 *result = tre_ast_new_union(mem, uni->left, uni->right);
2162 if (*result == NULL)
2164 status = REG_ESPACE;
2167 tmp = (*result)->obj;
2168 result = &tmp->left;
2169 STACK_PUSHX(stack, voidptr, uni->right);
2170 STACK_PUSHX(stack, int, COPY_RECURSE);
2171 STACK_PUSHX(stack, voidptr, &tmp->right);
2172 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2173 STACK_PUSHX(stack, voidptr, uni->left);
2174 STACK_PUSHX(stack, int, COPY_RECURSE);
2179 tre_catenation_t *cat = node->obj;
2180 tre_catenation_t *tmp;
2181 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2182 if (*result == NULL)
2184 status = REG_ESPACE;
2187 tmp = (*result)->obj;
2190 result = &tmp->left;
2192 STACK_PUSHX(stack, voidptr, cat->right);
2193 STACK_PUSHX(stack, int, COPY_RECURSE);
2194 STACK_PUSHX(stack, voidptr, &tmp->right);
2195 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2196 STACK_PUSHX(stack, voidptr, cat->left);
2197 STACK_PUSHX(stack, int, COPY_RECURSE);
2202 tre_iteration_t *iter = node->obj;
2203 STACK_PUSHX(stack, voidptr, iter->arg);
2204 STACK_PUSHX(stack, int, COPY_RECURSE);
2205 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2206 iter->max, iter->minimal);
2207 if (*result == NULL)
2209 status = REG_ESPACE;
2212 iter = (*result)->obj;
2213 result = &iter->arg;
2223 *pos_add += num_copied;
2230 } tre_expand_ast_symbol_t;
2232 /* Expands each iteration node that has a finite nonzero minimum or maximum
2233 iteration count to a catenated sequence of copies of the node. */
2234 static reg_errcode_t
2235 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2236 int *position, tre_tag_direction_t *tag_directions,
2239 reg_errcode_t status = REG_OK;
2240 int bottom = tre_stack_num_objects(stack);
2242 int pos_add_total = 0;
2246 STACK_PUSHR(stack, voidptr, ast);
2247 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2248 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2250 tre_ast_node_t *node;
2251 tre_expand_ast_symbol_t symbol;
2253 if (status != REG_OK)
2256 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2257 node = tre_stack_pop_voidptr(stack);
2260 case EXPAND_RECURSE:
2265 tre_literal_t *lit= node->obj;
2266 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2268 lit->position += pos_add;
2269 if (lit->position > max_pos)
2270 max_pos = lit->position;
2276 tre_union_t *uni = node->obj;
2277 STACK_PUSHX(stack, voidptr, uni->right);
2278 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2279 STACK_PUSHX(stack, voidptr, uni->left);
2280 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2285 tre_catenation_t *cat = node->obj;
2286 STACK_PUSHX(stack, voidptr, cat->right);
2287 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2288 STACK_PUSHX(stack, voidptr, cat->left);
2289 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2294 tre_iteration_t *iter = node->obj;
2295 STACK_PUSHX(stack, int, pos_add);
2296 STACK_PUSHX(stack, voidptr, node);
2297 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2298 STACK_PUSHX(stack, voidptr, iter->arg);
2299 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2300 /* If we are going to expand this node at EXPAND_AFTER_ITER
2301 then don't increase the `pos' fields of the nodes now, it
2302 will get done when expanding. */
2303 if (iter->min > 1 || iter->max > 1)
2313 case EXPAND_AFTER_ITER:
2315 tre_iteration_t *iter = node->obj;
2317 pos_add = tre_stack_pop_int(stack);
2318 pos_add_last = pos_add;
2319 if (iter->min > 1 || iter->max > 1)
2321 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2323 int pos_add_save = pos_add;
2325 /* Create a catenated sequence of copies of the node. */
2326 for (j = 0; j < iter->min; j++)
2328 tre_ast_node_t *copy;
2329 /* Remove tags from all but the last copy. */
2330 int flags = ((j + 1 < iter->min)
2332 : COPY_MAXIMIZE_FIRST_TAG);
2333 pos_add_save = pos_add;
2334 status = tre_copy_ast(mem, stack, iter->arg, flags,
2335 &pos_add, tag_directions, ©,
2337 if (status != REG_OK)
2340 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2347 if (iter->max == -1)
2349 /* No upper limit. */
2350 pos_add_save = pos_add;
2351 status = tre_copy_ast(mem, stack, iter->arg, 0,
2352 &pos_add, NULL, &seq2, &max_pos);
2353 if (status != REG_OK)
2355 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2361 for (j = iter->min; j < iter->max; j++)
2363 tre_ast_node_t *tmp, *copy;
2364 pos_add_save = pos_add;
2365 status = tre_copy_ast(mem, stack, iter->arg, 0,
2366 &pos_add, NULL, ©, &max_pos);
2367 if (status != REG_OK)
2370 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2375 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2378 seq2 = tre_ast_new_union(mem, tmp, seq2);
2384 pos_add = pos_add_save;
2387 else if (seq2 != NULL)
2388 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2391 node->obj = seq1->obj;
2392 node->type = seq1->type;
2396 pos_add_total += pos_add - pos_add_last;
2397 if (iter_depth == 0)
2398 pos_add = pos_add_total;
2408 *position += pos_add_total;
2410 /* `max_pos' should never be larger than `*position' if the above
2411 code works, but just an extra safeguard let's make sure
2412 `*position' is set large enough so enough memory will be
2413 allocated for the transition table. */
2414 if (max_pos > *position)
2415 *position = max_pos;
2420 static tre_pos_and_tags_t *
2421 tre_set_empty(tre_mem_t mem)
2423 tre_pos_and_tags_t *new_set;
2425 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2426 if (new_set == NULL)
2429 new_set[0].position = -1;
2430 new_set[0].code_min = -1;
2431 new_set[0].code_max = -1;
2436 static tre_pos_and_tags_t *
2437 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2438 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2440 tre_pos_and_tags_t *new_set;
2442 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2443 if (new_set == NULL)
2446 new_set[0].position = position;
2447 new_set[0].code_min = code_min;
2448 new_set[0].code_max = code_max;
2449 new_set[0].class = class;
2450 new_set[0].neg_classes = neg_classes;
2451 new_set[0].backref = backref;
2452 new_set[1].position = -1;
2453 new_set[1].code_min = -1;
2454 new_set[1].code_max = -1;
2459 static tre_pos_and_tags_t *
2460 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2461 int *tags, int assertions)
2464 tre_pos_and_tags_t *new_set;
2468 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2469 for (s1 = 0; set1[s1].position >= 0; s1++);
2470 for (s2 = 0; set2[s2].position >= 0; s2++);
2471 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2475 for (s1 = 0; set1[s1].position >= 0; s1++)
2477 new_set[s1].position = set1[s1].position;
2478 new_set[s1].code_min = set1[s1].code_min;
2479 new_set[s1].code_max = set1[s1].code_max;
2480 new_set[s1].assertions = set1[s1].assertions | assertions;
2481 new_set[s1].class = set1[s1].class;
2482 new_set[s1].neg_classes = set1[s1].neg_classes;
2483 new_set[s1].backref = set1[s1].backref;
2484 if (set1[s1].tags == NULL && tags == NULL)
2485 new_set[s1].tags = NULL;
2488 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2489 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2490 * (i + num_tags + 1)));
2491 if (new_tags == NULL)
2493 for (j = 0; j < i; j++)
2494 new_tags[j] = set1[s1].tags[j];
2495 for (i = 0; i < num_tags; i++)
2496 new_tags[j + i] = tags[i];
2497 new_tags[j + i] = -1;
2498 new_set[s1].tags = new_tags;
2502 for (s2 = 0; set2[s2].position >= 0; s2++)
2504 new_set[s1 + s2].position = set2[s2].position;
2505 new_set[s1 + s2].code_min = set2[s2].code_min;
2506 new_set[s1 + s2].code_max = set2[s2].code_max;
2507 /* XXX - why not | assertions here as well? */
2508 new_set[s1 + s2].assertions = set2[s2].assertions;
2509 new_set[s1 + s2].class = set2[s2].class;
2510 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2511 new_set[s1 + s2].backref = set2[s2].backref;
2512 if (set2[s2].tags == NULL)
2513 new_set[s1 + s2].tags = NULL;
2516 for (i = 0; set2[s2].tags[i] >= 0; i++);
2517 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2518 if (new_tags == NULL)
2520 for (j = 0; j < i; j++)
2521 new_tags[j] = set2[s2].tags[j];
2523 new_set[s1 + s2].tags = new_tags;
2526 new_set[s1 + s2].position = -1;
2530 /* Finds the empty path through `node' which is the one that should be
2531 taken according to POSIX.2 rules, and adds the tags on that path to
2532 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2533 set to the number of tags seen on the path. */
2534 static reg_errcode_t
2535 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2536 int *assertions, int *params, int *num_tags_seen,
2541 tre_catenation_t *cat;
2542 tre_iteration_t *iter;
2544 int bottom = tre_stack_num_objects(stack);
2545 reg_errcode_t status = REG_OK;
2549 status = tre_stack_push_voidptr(stack, node);
2551 /* Walk through the tree recursively. */
2552 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2554 node = tre_stack_pop_voidptr(stack);
2559 lit = (tre_literal_t *)node->obj;
2560 switch (lit->code_min)
2563 if (lit->code_max >= 0)
2567 /* Add the tag to `tags'. */
2568 for (i = 0; tags[i] >= 0; i++)
2569 if (tags[i] == lit->code_max)
2573 tags[i] = lit->code_max;
2582 assert(lit->code_max >= 1
2583 || lit->code_max <= ASSERT_LAST);
2584 if (assertions != NULL)
2585 *assertions |= lit->code_max;
2596 /* Subexpressions starting earlier take priority over ones
2597 starting later, so we prefer the left subexpression over the
2598 right subexpression. */
2599 uni = (tre_union_t *)node->obj;
2600 if (uni->left->nullable)
2601 STACK_PUSHX(stack, voidptr, uni->left)
2602 else if (uni->right->nullable)
2603 STACK_PUSHX(stack, voidptr, uni->right)
2609 /* The path must go through both children. */
2610 cat = (tre_catenation_t *)node->obj;
2611 assert(cat->left->nullable);
2612 assert(cat->right->nullable);
2613 STACK_PUSHX(stack, voidptr, cat->left);
2614 STACK_PUSHX(stack, voidptr, cat->right);
2618 /* A match with an empty string is preferred over no match at
2619 all, so we go through the argument if possible. */
2620 iter = (tre_iteration_t *)node->obj;
2621 if (iter->arg->nullable)
2622 STACK_PUSHX(stack, voidptr, iter->arg);
2638 NFL_POST_CATENATION,
2640 } tre_nfl_stack_symbol_t;
2643 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2644 the nodes of the AST `tree'. */
2645 static reg_errcode_t
2646 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2648 int bottom = tre_stack_num_objects(stack);
2650 STACK_PUSHR(stack, voidptr, tree);
2651 STACK_PUSHR(stack, int, NFL_RECURSE);
2653 while (tre_stack_num_objects(stack) > bottom)
2655 tre_nfl_stack_symbol_t symbol;
2656 tre_ast_node_t *node;
2658 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2659 node = tre_stack_pop_voidptr(stack);
2667 tre_literal_t *lit = (tre_literal_t *)node->obj;
2668 if (IS_BACKREF(lit))
2670 /* Back references: nullable = false, firstpos = {i},
2673 node->firstpos = tre_set_one(mem, lit->position, 0,
2674 TRE_CHAR_MAX, 0, NULL, -1);
2675 if (!node->firstpos)
2677 node->lastpos = tre_set_one(mem, lit->position, 0,
2678 TRE_CHAR_MAX, 0, NULL,
2679 (int)lit->code_max);
2683 else if (lit->code_min < 0)
2685 /* Tags, empty strings, params, and zero width assertions:
2686 nullable = true, firstpos = {}, and lastpos = {}. */
2688 node->firstpos = tre_set_empty(mem);
2689 if (!node->firstpos)
2691 node->lastpos = tre_set_empty(mem);
2697 /* Literal at position i: nullable = false, firstpos = {i},
2701 tre_set_one(mem, lit->position, (int)lit->code_min,
2702 (int)lit->code_max, 0, NULL, -1);
2703 if (!node->firstpos)
2705 node->lastpos = tre_set_one(mem, lit->position,
2708 lit->u.class, lit->neg_classes,
2717 /* Compute the attributes for the two subtrees, and after that
2719 STACK_PUSHR(stack, voidptr, node);
2720 STACK_PUSHR(stack, int, NFL_POST_UNION);
2721 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2722 STACK_PUSHR(stack, int, NFL_RECURSE);
2723 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2724 STACK_PUSHR(stack, int, NFL_RECURSE);
2728 /* Compute the attributes for the two subtrees, and after that
2730 STACK_PUSHR(stack, voidptr, node);
2731 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2732 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2733 STACK_PUSHR(stack, int, NFL_RECURSE);
2734 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2735 STACK_PUSHR(stack, int, NFL_RECURSE);
2739 /* Compute the attributes for the subtree, and after that for
2741 STACK_PUSHR(stack, voidptr, node);
2742 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2743 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2744 STACK_PUSHR(stack, int, NFL_RECURSE);
2747 break; /* end case: NFL_RECURSE */
2749 case NFL_POST_UNION:
2751 tre_union_t *uni = (tre_union_t *)node->obj;
2752 node->nullable = uni->left->nullable || uni->right->nullable;
2753 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2754 uni->right->firstpos, NULL, 0);
2755 if (!node->firstpos)
2757 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2758 uni->right->lastpos, NULL, 0);
2764 case NFL_POST_ITERATION:
2766 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2768 if (iter->min == 0 || iter->arg->nullable)
2772 node->firstpos = iter->arg->firstpos;
2773 node->lastpos = iter->arg->lastpos;
2777 case NFL_POST_CATENATION:
2779 int num_tags, *tags, assertions, params_seen;
2781 reg_errcode_t status;
2782 tre_catenation_t *cat = node->obj;
2783 node->nullable = cat->left->nullable && cat->right->nullable;
2785 /* Compute firstpos. */
2786 if (cat->left->nullable)
2788 /* The left side matches the empty string. Make a first pass
2789 with tre_match_empty() to get the number of tags and
2791 status = tre_match_empty(stack, cat->left,
2792 NULL, NULL, NULL, &num_tags,
2794 if (status != REG_OK)
2796 /* Allocate arrays for the tags and parameters. */
2797 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2802 /* Second pass with tre_mach_empty() to get the list of
2803 tags and parameters. */
2804 status = tre_match_empty(stack, cat->left, tags,
2805 &assertions, params, NULL, NULL);
2806 if (status != REG_OK)
2812 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2815 if (!node->firstpos)
2820 node->firstpos = cat->left->firstpos;
2823 /* Compute lastpos. */
2824 if (cat->right->nullable)
2826 /* The right side matches the empty string. Make a first pass
2827 with tre_match_empty() to get the number of tags and
2829 status = tre_match_empty(stack, cat->right,
2830 NULL, NULL, NULL, &num_tags,
2832 if (status != REG_OK)
2834 /* Allocate arrays for the tags and parameters. */
2835 tags = xmalloc(sizeof(int) * (num_tags + 1));
2840 /* Second pass with tre_mach_empty() to get the list of
2841 tags and parameters. */
2842 status = tre_match_empty(stack, cat->right, tags,
2843 &assertions, params, NULL, NULL);
2844 if (status != REG_OK)
2850 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2858 node->lastpos = cat->right->lastpos;
2873 /* Adds a transition from each position in `p1' to each position in `p2'. */
2874 static reg_errcode_t
2875 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2876 tre_tnfa_transition_t *transitions,
2877 int *counts, int *offs)
2879 tre_pos_and_tags_t *orig_p2 = p2;
2880 tre_tnfa_transition_t *trans;
2881 int i, j, k, l, dup, prev_p2_pos;
2883 if (transitions != NULL)
2884 while (p1->position >= 0)
2888 while (p2->position >= 0)
2890 /* Optimization: if this position was already handled, skip it. */
2891 if (p2->position == prev_p2_pos)
2896 prev_p2_pos = p2->position;
2897 /* Set `trans' to point to the next unused transition from
2898 position `p1->position'. */
2899 trans = transitions + offs[p1->position];
2900 while (trans->state != NULL)
2903 /* If we find a previous transition from `p1->position' to
2904 `p2->position', it is overwritten. This can happen only
2905 if there are nested loops in the regexp, like in "((a)*)*".
2906 In POSIX.2 repetition using the outer loop is always
2907 preferred over using the inner loop. Therefore the
2908 transition for the inner loop is useless and can be thrown
2910 /* XXX - The same position is used for all nodes in a bracket
2911 expression, so this optimization cannot be used (it will
2912 break bracket expressions) unless I figure out a way to
2914 if (trans->state_id == p2->position)
2922 if (trans->state == NULL)
2923 (trans + 1)->state = NULL;
2924 /* Use the character ranges, assertions, etc. from `p1' for
2925 the transition from `p1' to `p2'. */
2926 trans->code_min = p1->code_min;
2927 trans->code_max = p1->code_max;
2928 trans->state = transitions + offs[p2->position];
2929 trans->state_id = p2->position;
2930 trans->assertions = p1->assertions | p2->assertions
2931 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2932 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2933 if (p1->backref >= 0)
2935 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2936 assert(p2->backref < 0);
2937 trans->u.backref = p1->backref;
2938 trans->assertions |= ASSERT_BACKREF;
2941 trans->u.class = p1->class;
2942 if (p1->neg_classes != NULL)
2944 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2945 trans->neg_classes =
2946 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2947 if (trans->neg_classes == NULL)
2949 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2950 trans->neg_classes[i] = p1->neg_classes[i];
2951 trans->neg_classes[i] = (tre_ctype_t)0;
2954 trans->neg_classes = NULL;
2956 /* Find out how many tags this transition has. */
2958 if (p1->tags != NULL)
2959 while(p1->tags[i] >= 0)
2962 if (p2->tags != NULL)
2963 while(p2->tags[j] >= 0)
2966 /* If we are overwriting a transition, free the old tag array. */
2967 if (trans->tags != NULL)
2971 /* If there were any tags, allocate an array and fill it. */
2974 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2978 if (p1->tags != NULL)
2979 while(p1->tags[i] >= 0)
2981 trans->tags[i] = p1->tags[i];
2986 if (p2->tags != NULL)
2987 while (p2->tags[j] >= 0)
2989 /* Don't add duplicates. */
2991 for (k = 0; k < i; k++)
2992 if (trans->tags[k] == p2->tags[j])
2998 trans->tags[l++] = p2->tags[j];
3001 trans->tags[l] = -1;
3009 /* Compute a maximum limit for the number of transitions leaving
3011 while (p1->position >= 0)
3014 while (p2->position >= 0)
3016 counts[p1->position]++;
3024 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3025 labelled with one character range (there are no transitions on empty
3026 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3028 static reg_errcode_t
3029 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3030 int *counts, int *offs)
3033 tre_catenation_t *cat;
3034 tre_iteration_t *iter;
3035 reg_errcode_t errcode = REG_OK;
3037 /* XXX - recurse using a stack!. */
3043 uni = (tre_union_t *)node->obj;
3044 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3045 if (errcode != REG_OK)
3047 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3051 cat = (tre_catenation_t *)node->obj;
3052 /* Add a transition from each position in cat->left->lastpos
3053 to each position in cat->right->firstpos. */
3054 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3055 transitions, counts, offs);
3056 if (errcode != REG_OK)
3058 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3059 if (errcode != REG_OK)
3061 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3065 iter = (tre_iteration_t *)node->obj;
3066 assert(iter->max == -1 || iter->max == 1);
3068 if (iter->max == -1)
3070 assert(iter->min == 0 || iter->min == 1);
3071 /* Add a transition from each last position in the iterated
3072 expression to each first position. */
3073 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3074 transitions, counts, offs);
3075 if (errcode != REG_OK)
3078 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3085 #define ERROR_EXIT(err) \
3089 if (/*CONSTCOND*/1) \
3092 while (/*CONSTCOND*/0)
3096 regcomp(regex_t *preg, const char *regex, int cflags)
3099 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3100 tre_pos_and_tags_t *p;
3101 int *counts = NULL, *offs = NULL;
3103 tre_tnfa_transition_t *transitions, *initial;
3104 tre_tnfa_t *tnfa = NULL;
3105 tre_submatch_data_t *submatch_data;
3106 tre_tag_direction_t *tag_directions = NULL;
3107 reg_errcode_t errcode;
3110 /* Parse context. */
3111 tre_parse_ctx_t parse_ctx;
3113 /* Allocate a stack used throughout the compilation process for various
3115 stack = tre_stack_new(512, 10240, 128);
3118 /* Allocate a fast memory allocator. */
3119 mem = tre_mem_new();
3122 tre_stack_destroy(stack);
3126 /* Parse the regexp. */
3127 memset(&parse_ctx, 0, sizeof(parse_ctx));
3128 parse_ctx.mem = mem;
3129 parse_ctx.stack = stack;
3130 parse_ctx.re = regex;
3131 parse_ctx.cflags = cflags;
3132 parse_ctx.max_backref = -1;
3133 errcode = tre_parse(&parse_ctx);
3134 if (errcode != REG_OK)
3135 ERROR_EXIT(errcode);
3136 preg->re_nsub = parse_ctx.submatch_id - 1;
3137 tree = parse_ctx.result;
3139 /* Back references and approximate matching cannot currently be used
3140 in the same regexp. */
3141 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3142 ERROR_EXIT(REG_BADPAT);
3145 tre_ast_print(tree);
3146 #endif /* TRE_DEBUG */
3148 /* Referring to nonexistent subexpressions is illegal. */
3149 if (parse_ctx.max_backref > (int)preg->re_nsub)
3150 ERROR_EXIT(REG_ESUBREG);
3152 /* Allocate the TNFA struct. */
3153 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3155 ERROR_EXIT(REG_ESPACE);
3156 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3157 tnfa->have_approx = parse_ctx.have_approx;
3158 tnfa->num_submatches = parse_ctx.submatch_id;
3160 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3161 regexp does not have back references, this can be skipped. */
3162 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3165 /* Figure out how many tags we will need. */
3166 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3167 if (errcode != REG_OK)
3168 ERROR_EXIT(errcode);
3170 if (tnfa->num_tags > 0)
3172 tag_directions = xmalloc(sizeof(*tag_directions)
3173 * (tnfa->num_tags + 1));
3174 if (tag_directions == NULL)
3175 ERROR_EXIT(REG_ESPACE);
3176 tnfa->tag_directions = tag_directions;
3177 memset(tag_directions, -1,
3178 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3180 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3181 sizeof(tnfa->minimal_tags));
3182 if (tnfa->minimal_tags == NULL)
3183 ERROR_EXIT(REG_ESPACE);
3185 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3186 sizeof(*submatch_data));
3187 if (submatch_data == NULL)
3188 ERROR_EXIT(REG_ESPACE);
3189 tnfa->submatch_data = submatch_data;
3191 errcode = tre_add_tags(mem, stack, tree, tnfa);
3192 if (errcode != REG_OK)
3193 ERROR_EXIT(errcode);
3197 /* Expand iteration nodes. */
3198 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3199 tag_directions, &tnfa->params_depth);
3200 if (errcode != REG_OK)
3201 ERROR_EXIT(errcode);
3203 /* Add a dummy node for the final state.
3204 XXX - For certain patterns this dummy node can be optimized away,
3205 for example "a*" or "ab*". Figure out a simple way to detect
3206 this possibility. */
3208 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3209 if (tmp_ast_r == NULL)
3210 ERROR_EXIT(REG_ESPACE);
3212 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3214 ERROR_EXIT(REG_ESPACE);
3216 errcode = tre_compute_nfl(mem, stack, tree);
3217 if (errcode != REG_OK)
3218 ERROR_EXIT(errcode);
3220 counts = xmalloc(sizeof(int) * parse_ctx.position);
3222 ERROR_EXIT(REG_ESPACE);
3224 offs = xmalloc(sizeof(int) * parse_ctx.position);
3226 ERROR_EXIT(REG_ESPACE);
3228 for (i = 0; i < parse_ctx.position; i++)
3230 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3233 for (i = 0; i < parse_ctx.position; i++)
3236 add += counts[i] + 1;
3239 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3240 if (transitions == NULL)
3241 ERROR_EXIT(REG_ESPACE);
3242 tnfa->transitions = transitions;
3243 tnfa->num_transitions = add;
3245 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3246 if (errcode != REG_OK)
3247 ERROR_EXIT(errcode);
3249 tnfa->firstpos_chars = NULL;
3253 while (p->position >= 0)
3259 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3260 if (initial == NULL)
3261 ERROR_EXIT(REG_ESPACE);
3262 tnfa->initial = initial;
3265 for (p = tree->firstpos; p->position >= 0; p++)
3267 initial[i].state = transitions + offs[p->position];
3268 initial[i].state_id = p->position;
3269 initial[i].tags = NULL;
3270 /* Copy the arrays p->tags, and p->params, they are allocated
3271 from a tre_mem object. */
3275 for (j = 0; p->tags[j] >= 0; j++);
3276 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3277 if (!initial[i].tags)
3278 ERROR_EXIT(REG_ESPACE);
3279 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3281 initial[i].assertions = p->assertions;
3284 initial[i].state = NULL;
3286 tnfa->num_transitions = add;
3287 tnfa->final = transitions + offs[tree->lastpos[0].position];
3288 tnfa->num_states = parse_ctx.position;
3289 tnfa->cflags = cflags;
3291 tre_mem_destroy(mem);
3292 tre_stack_destroy(stack);
3296 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3300 /* Free everything that was allocated and return the error code. */
3301 tre_mem_destroy(mem);
3303 tre_stack_destroy(stack);
3308 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3317 regfree(regex_t *preg)
3321 tre_tnfa_transition_t *trans;
3323 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3327 for (i = 0; i < tnfa->num_transitions; i++)
3328 if (tnfa->transitions[i].state)
3330 if (tnfa->transitions[i].tags)
3331 xfree(tnfa->transitions[i].tags);
3332 if (tnfa->transitions[i].neg_classes)
3333 xfree(tnfa->transitions[i].neg_classes);
3335 if (tnfa->transitions)
3336 xfree(tnfa->transitions);
3340 for (trans = tnfa->initial; trans->state; trans++)
3345 xfree(tnfa->initial);
3348 if (tnfa->submatch_data)
3350 for (i = 0; i < tnfa->num_submatches; i++)
3351 if (tnfa->submatch_data[i].parents)
3352 xfree(tnfa->submatch_data[i].parents);
3353 xfree(tnfa->submatch_data);
3356 if (tnfa->tag_directions)
3357 xfree(tnfa->tag_directions);
3358 if (tnfa->firstpos_chars)
3359 xfree(tnfa->firstpos_chars);
3360 if (tnfa->minimal_tags)
3361 xfree(tnfa->minimal_tags);