2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 /***********************************************************************
45 ***********************************************************************/
54 tre_ctype_t *neg_classes;
60 /***********************************************************************
61 from tre-ast.c and tre-ast.h
62 ***********************************************************************/
64 /* The different AST node types. */
72 /* Special subtypes of TRE_LITERAL. */
73 #define EMPTY -1 /* Empty leaf (denotes empty string). */
74 #define ASSERTION -2 /* Assertion leaf. */
75 #define TAG -3 /* Tag leaf. */
76 #define BACKREF -4 /* Back reference leaf. */
78 #define IS_SPECIAL(x) ((x)->code_min < 0)
79 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
80 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
81 #define IS_TAG(x) ((x)->code_min == TAG)
82 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
85 /* A generic AST node. All AST nodes consist of this node on the top
86 level with `obj' pointing to the actual content. */
88 tre_ast_type_t type; /* Type of the node. */
89 void *obj; /* Pointer to actual node. */
94 tre_pos_and_tags_t *firstpos;
95 tre_pos_and_tags_t *lastpos;
99 /* A "literal" node. These are created for assertions, back references,
100 tags, matching parameter settings, and all expressions that match one
110 tre_ctype_t *neg_classes;
113 /* A "catenation" node. These are created when two regexps are concatenated.
114 If there are more than one subexpressions in sequence, the `left' part
115 holds all but the last, and `right' part holds the last subexpression
116 (catenation is left associative). */
118 tre_ast_node_t *left;
119 tre_ast_node_t *right;
122 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
125 /* Subexpression to match. */
127 /* Minimum number of consecutive matches. */
129 /* Maximum number of consecutive matches. */
131 /* If 0, match as many characters as possible, if 1 match as few as
132 possible. Note that this does not always mean the same thing as
133 matching as many/few repetitions as possible. */
134 unsigned int minimal:1;
137 /* An "union" node. These are created for the "|" operator. */
139 tre_ast_node_t *left;
140 tre_ast_node_t *right;
143 static tre_ast_node_t *
144 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
146 static tre_ast_node_t *
147 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
149 static tre_ast_node_t *
150 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
153 static tre_ast_node_t *
154 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
156 static tre_ast_node_t *
157 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
158 tre_ast_node_t *right);
161 static tre_ast_node_t *
162 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
164 tre_ast_node_t *node;
166 node = tre_mem_calloc(mem, sizeof(*node));
169 node->obj = tre_mem_calloc(mem, size);
174 node->submatch_id = -1;
179 static tre_ast_node_t *
180 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
182 tre_ast_node_t *node;
185 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
189 lit->code_min = code_min;
190 lit->code_max = code_max;
191 lit->position = position;
196 static tre_ast_node_t *
197 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
200 tre_ast_node_t *node;
201 tre_iteration_t *iter;
203 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
210 iter->minimal = minimal;
211 node->num_submatches = arg->num_submatches;
216 static tre_ast_node_t *
217 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
219 tre_ast_node_t *node;
221 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
224 ((tre_union_t *)node->obj)->left = left;
225 ((tre_union_t *)node->obj)->right = right;
226 node->num_submatches = left->num_submatches + right->num_submatches;
231 static tre_ast_node_t *
232 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
233 tre_ast_node_t *right)
235 tre_ast_node_t *node;
237 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
240 ((tre_catenation_t *)node->obj)->left = left;
241 ((tre_catenation_t *)node->obj)->right = right;
242 node->num_submatches = left->num_submatches + right->num_submatches;
248 /***********************************************************************
249 from tre-stack.c and tre-stack.h
250 ***********************************************************************/
252 typedef struct tre_stack_rec tre_stack_t;
254 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
255 is maximum size, and `increment' specifies how much more space will be
256 allocated with realloc() if all space gets used up. Returns the stack
257 object or NULL if out of memory. */
259 tre_stack_new(int size, int max_size, int increment);
261 /* Frees the stack object. */
263 tre_stack_destroy(tre_stack_t *s);
265 /* Returns the current number of objects in the stack. */
267 tre_stack_num_objects(tre_stack_t *s);
269 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
270 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
271 This tries to realloc() more space before failing if maximum size
272 has not yet been reached. Returns REG_OK if successful. */
273 #define declare_pushf(typetag, type) \
274 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
276 declare_pushf(voidptr, void *);
277 declare_pushf(int, int);
279 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
280 element off of stack `s' and returns it. The stack must not be
282 #define declare_popf(typetag, type) \
283 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
285 declare_popf(voidptr, void *);
286 declare_popf(int, int);
288 /* Just to save some typing. */
289 #define STACK_PUSH(s, typetag, value) \
292 status = tre_stack_push_ ## typetag(s, value); \
294 while (/*CONSTCOND*/0)
296 #define STACK_PUSHX(s, typetag, value) \
298 status = tre_stack_push_ ## typetag(s, value); \
299 if (status != REG_OK) \
303 #define STACK_PUSHR(s, typetag, value) \
305 reg_errcode_t _status; \
306 _status = tre_stack_push_ ## typetag(s, value); \
307 if (_status != REG_OK) \
311 union tre_stack_item {
316 struct tre_stack_rec {
321 union tre_stack_item *stack;
326 tre_stack_new(int size, int max_size, int increment)
330 s = xmalloc(sizeof(*s));
333 s->stack = xmalloc(sizeof(*s->stack) * size);
334 if (s->stack == NULL)
340 s->max_size = max_size;
341 s->increment = increment;
348 tre_stack_destroy(tre_stack_t *s)
355 tre_stack_num_objects(tre_stack_t *s)
361 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
363 if (s->ptr < s->size)
365 s->stack[s->ptr] = value;
370 if (s->size >= s->max_size)
376 union tre_stack_item *new_buffer;
378 new_size = s->size + s->increment;
379 if (new_size > s->max_size)
380 new_size = s->max_size;
381 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
382 if (new_buffer == NULL)
386 assert(new_size > s->size);
388 s->stack = new_buffer;
389 tre_stack_push(s, value);
395 #define define_pushf(typetag, type) \
396 declare_pushf(typetag, type) { \
397 union tre_stack_item item; \
398 item.typetag ## _value = value; \
399 return tre_stack_push(s, item); \
402 define_pushf(int, int)
403 define_pushf(voidptr, void *)
405 #define define_popf(typetag, type) \
406 declare_popf(typetag, type) { \
407 return s->stack[--s->ptr].typetag ## _value; \
410 define_popf(int, int)
411 define_popf(voidptr, void *)
414 /***********************************************************************
415 from tre-parse.c and tre-parse.h
416 ***********************************************************************/
420 /* Memory allocator. The AST is allocated using this. */
422 /* Stack used for keeping track of regexp syntax. */
424 /* The parse result. */
425 tre_ast_node_t *result;
426 /* The regexp to parse and its length. */
428 /* The first character of the entire regexp. */
429 const char *re_start;
430 /* Current submatch ID. */
432 /* Current position (number of literal). */
434 /* The highest back reference or -1 if none seen so far. */
436 /* This flag is set if the regexp uses approximate matching. */
438 /* Compilation flags. */
440 /* If this flag is set the top-level submatch is not captured. */
444 /* Parses a wide character regexp pattern into a syntax tree. This parser
445 handles both syntaxes (BRE and ERE), including the TRE extensions. */
447 tre_parse(tre_parse_ctx_t *ctx);
451 This parser is just a simple recursive descent parser for POSIX.2
452 regexps. The parser supports both the obsolete default syntax and
453 the "extended" syntax, and some nonstandard extensions.
456 /* Characters with special meanings in regexp syntax. */
457 #define CHAR_PIPE '|'
458 #define CHAR_LPAREN '('
459 #define CHAR_RPAREN ')'
460 #define CHAR_LBRACE '{'
461 #define CHAR_RBRACE '}'
462 #define CHAR_LBRACKET '['
463 #define CHAR_RBRACKET ']'
464 #define CHAR_MINUS '-'
465 #define CHAR_STAR '*'
466 #define CHAR_QUESTIONMARK '?'
467 #define CHAR_PLUS '+'
468 #define CHAR_PERIOD '.'
469 #define CHAR_COLON ':'
470 #define CHAR_EQUAL '='
471 #define CHAR_COMMA ','
472 #define CHAR_CARET '^'
473 #define CHAR_DOLLAR '$'
474 #define CHAR_BACKSLASH '\\'
475 #define CHAR_HASH '#'
476 #define CHAR_TILDE '~'
479 /* Some macros for expanding \w, \s, etc. */
480 static const struct tre_macro_struct {
482 const char *expansion;
484 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
485 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
486 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
487 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
492 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
493 must have at least `len' items. Sets buf[0] to zero if the there
494 is no match in `tre_macros'. */
496 tre_expand_macro(const char *regex)
503 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
504 return tre_macros[i].expansion;
508 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
509 tre_ast_node_t ***items)
511 reg_errcode_t status;
512 tre_ast_node_t **array = *items;
513 /* Allocate more space if necessary. */
516 tre_ast_node_t **new_items;
517 /* If the array is already 1024 items large, give up -- there's
518 probably an error in the regexp (e.g. not a '\0' terminated
519 string and missing ']') */
523 new_items = xrealloc(array, sizeof(*items) * *max_i);
524 if (new_items == NULL)
526 *items = array = new_items;
528 array[*i] = tre_ast_new_literal(mem, min, max, -1);
529 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
536 tre_compare_items(const void *a, const void *b)
538 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
539 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
540 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
541 int a_min = l_a->code_min, b_min = l_b->code_min;
545 else if (a_min > b_min)
551 /* Maximum number of character classes that can occur in a negated bracket
553 #define MAX_NEG_CLASSES 64
555 /* Maximum length of character class names. */
556 #define MAX_CLASS_NAME
559 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
560 tre_ctype_t neg_classes[], int *num_neg_classes,
561 tre_ast_node_t ***items, int *num_items,
564 const char *re = ctx->re;
565 reg_errcode_t status = REG_OK;
566 tre_ctype_t class = (tre_ctype_t)0;
568 int max_i = *items_size;
571 /* Build an array of the items in the bracket expression. */
572 while (status == REG_OK)
579 else if (*re == CHAR_RBRACKET && re > ctx->re)
586 tre_cint_t min = 0, max = 0;
588 int clen = mbtowc(&wc, re, -1);
590 if (clen<0) clen=1, wc=WEOF;
592 class = (tre_ctype_t)0;
593 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
597 clen = mbtowc(&wc, re, -1);
598 if (clen<0) clen=1, wc=WEOF;
601 /* XXX - Should use collation order instead of encoding values
602 in character ranges. */
606 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
607 status = REG_ECOLLATE;
608 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
609 status = REG_ECOLLATE;
610 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
613 const char *endptr = re + 2;
615 while (*endptr && *endptr != CHAR_COLON)
619 len = MIN(endptr - re - 2, 63);
620 strncpy(tmp_str, re + 2, len);
622 class = tre_ctype(tmp_str);
634 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
636 /* Two ranges are not allowed to share and endpoint. */
642 if (status != REG_OK)
646 if (*num_neg_classes >= MAX_NEG_CLASSES)
649 neg_classes[(*num_neg_classes)++] = class;
652 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
653 if (status != REG_OK)
655 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
658 /* Add opposite-case counterpoints if REG_ICASE is present.
659 This is broken if there are more than two "same" characters. */
660 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
662 tre_cint_t cmin, ccurr;
666 if (tre_islower(min))
668 cmin = ccurr = tre_toupper(min++);
669 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
671 ccurr = tre_toupper(min++);
672 status = tre_new_item(ctx->mem, cmin, ccurr,
675 else if (tre_isupper(min))
677 cmin = ccurr = tre_tolower(min++);
678 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
680 ccurr = tre_tolower(min++);
681 status = tre_new_item(ctx->mem, cmin, ccurr,
685 if (status != REG_OK)
688 if (status != REG_OK)
700 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
702 tre_ast_node_t *node = NULL;
704 reg_errcode_t status = REG_OK;
705 tre_ast_node_t **items, *u, *n;
706 int i = 0, j, max_i = 32, curr_max, curr_min;
707 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
708 int num_neg_classes = 0;
710 /* Start off with an array of `max_i' elements. */
711 items = xmalloc(sizeof(*items) * max_i);
715 if (*ctx->re == CHAR_CARET)
721 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
724 if (status != REG_OK)
725 goto parse_bracket_done;
727 /* Sort the array if we need to negate it. */
729 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
731 curr_max = curr_min = 0;
732 /* Build a union of the items in the array, negated if necessary. */
733 for (j = 0; j < i && status == REG_OK; j++)
736 tre_literal_t *l = items[j]->obj;
745 curr_max = MAX(max + 1, curr_max);
752 if (curr_max >= curr_min)
754 l->code_min = curr_min;
755 l->code_max = curr_max;
761 curr_min = curr_max = max + 1;
768 l->position = ctx->position;
769 if (num_neg_classes > 0)
771 l->neg_classes = tre_mem_alloc(ctx->mem,
772 (sizeof(l->neg_classes)
773 * (num_neg_classes + 1)));
774 if (l->neg_classes == NULL)
779 for (k = 0; k < num_neg_classes; k++)
780 l->neg_classes[k] = neg_classes[k];
781 l->neg_classes[k] = (tre_ctype_t)0;
784 l->neg_classes = NULL;
789 u = tre_ast_new_union(ctx->mem, node, items[j]);
797 if (status != REG_OK)
798 goto parse_bracket_done;
803 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
808 tre_literal_t *l = n->obj;
809 if (num_neg_classes > 0)
811 l->neg_classes = tre_mem_alloc(ctx->mem,
812 (sizeof(l->neg_classes)
813 * (num_neg_classes + 1)));
814 if (l->neg_classes == NULL)
817 goto parse_bracket_done;
819 for (k = 0; k < num_neg_classes; k++)
820 l->neg_classes[k] = neg_classes[k];
821 l->neg_classes[k] = (tre_ctype_t)0;
824 l->neg_classes = NULL;
829 u = tre_ast_new_union(ctx->mem, node, n);
837 if (status != REG_OK)
838 goto parse_bracket_done;
842 #endif /* TRE_DEBUG */
852 /* Parses a positive decimal integer. Returns -1 if the string does not
853 contain a valid number. */
855 tre_parse_int(const char **regex)
858 const char *r = *regex;
863 num = num * 10 + *r - '0';
872 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
875 const char *r = ctx->re;
878 /* Parse number (minimum repetition count). */
880 if (*r >= '0' && *r <= '9') {
881 min = tre_parse_int(&r);
884 /* Parse comma and second number (maximum repetition count). */
886 if (*r == CHAR_COMMA)
889 max = tre_parse_int(&r);
892 /* Check that the repeat counts are sane. */
893 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
900 /* Empty contents of {}. */
904 /* Parse the ending '}' or '\}'.*/
905 if (ctx->cflags & REG_EXTENDED)
907 if (*r != CHAR_RBRACE)
913 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
918 /* Create the AST node(s). */
919 if (min == 0 && max == 0)
921 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
927 if (min < 0 && max < 0)
928 /* Only approximate parameters set, no repetitions. */
931 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
943 PARSE_MARK_FOR_SUBMATCH,
947 PARSE_POST_CATENATION,
952 } tre_parse_re_stack_symbol_t;
956 tre_parse(tre_parse_ctx_t *ctx)
958 tre_ast_node_t *result = NULL;
959 tre_parse_re_stack_symbol_t symbol;
960 reg_errcode_t status = REG_OK;
961 tre_stack_t *stack = ctx->stack;
962 int bottom = tre_stack_num_objects(stack);
967 if (!ctx->nofirstsub)
969 STACK_PUSH(stack, int, ctx->submatch_id);
970 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
973 STACK_PUSH(stack, int, PARSE_RE);
974 ctx->re_start = ctx->re;
977 /* The following is basically just a recursive descent parser. I use
978 an explicit stack instead of recursive functions mostly because of
979 two reasons: compatibility with systems which have an overflowable
980 call stack, and efficiency (both in lines of code and speed). */
981 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
983 if (status != REG_OK)
985 symbol = tre_stack_pop_int(stack);
989 /* Parse a full regexp. A regexp is one or more branches,
990 separated by the union operator `|'. */
991 if (ctx->cflags & REG_EXTENDED)
992 STACK_PUSHX(stack, int, PARSE_UNION);
993 STACK_PUSHX(stack, int, PARSE_BRANCH);
997 /* Parse a branch. A branch is one or more pieces, concatenated.
998 A piece is an atom possibly followed by a postfix operator. */
999 STACK_PUSHX(stack, int, PARSE_CATENATION);
1000 STACK_PUSHX(stack, int, PARSE_PIECE);
1004 /* Parse a piece. A piece is an atom possibly followed by one
1005 or more postfix operators. */
1006 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1007 STACK_PUSHX(stack, int, PARSE_ATOM);
1010 case PARSE_CATENATION:
1011 /* If the expression has not ended, parse another piece. */
1017 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1019 if ((ctx->cflags & REG_EXTENDED
1020 && c == CHAR_RPAREN && depth > 0)
1021 || (!(ctx->cflags & REG_EXTENDED)
1022 && (c == CHAR_BACKSLASH
1023 && *(ctx->re + 1) == CHAR_RPAREN)))
1025 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1026 status = REG_EPAREN;
1028 if (!(ctx->cflags & REG_EXTENDED))
1034 /* Default case, left associative concatenation. */
1035 STACK_PUSHX(stack, int, PARSE_CATENATION);
1036 STACK_PUSHX(stack, voidptr, result);
1037 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1038 STACK_PUSHX(stack, int, PARSE_PIECE);
1043 case PARSE_POST_CATENATION:
1045 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1046 tre_ast_node_t *tmp_node;
1047 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1060 STACK_PUSHX(stack, int, PARSE_UNION);
1061 STACK_PUSHX(stack, voidptr, result);
1062 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1063 STACK_PUSHX(stack, int, PARSE_BRANCH);
1076 case PARSE_POST_UNION:
1078 tre_ast_node_t *tmp_node;
1079 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1080 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1088 /* Parse postfix operators. */
1094 case CHAR_QUESTIONMARK:
1095 if (!(ctx->cflags & REG_EXTENDED))
1100 tre_ast_node_t *tmp_node;
1105 if (*ctx->re == CHAR_PLUS)
1107 if (*ctx->re == CHAR_QUESTIONMARK)
1111 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1113 if (tmp_node == NULL)
1116 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1120 case CHAR_BACKSLASH:
1121 /* "\{" is special without REG_EXTENDED */
1122 if (!(ctx->cflags & REG_EXTENDED)
1123 && *(ctx->re + 1) == CHAR_LBRACE)
1132 /* "{" is literal without REG_EXTENDED */
1133 if (!(ctx->cflags & REG_EXTENDED))
1139 status = tre_parse_bound(ctx, &result);
1140 if (status != REG_OK)
1142 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1148 /* Parse an atom. An atom is a regular expression enclosed in `()',
1149 an empty set of `()', a bracket expression, `.', `^', `$',
1150 a `\' followed by a character, or a single character. */
1154 case CHAR_LPAREN: /* parenthesized subexpression */
1156 if (ctx->cflags & REG_EXTENDED)
1162 /* First parse a whole RE, then mark the resulting tree
1164 STACK_PUSHX(stack, int, ctx->submatch_id);
1165 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1166 STACK_PUSHX(stack, int, PARSE_RE);
1174 case CHAR_LBRACKET: /* bracket expression */
1176 status = tre_parse_bracket(ctx, &result);
1177 if (status != REG_OK)
1181 case CHAR_BACKSLASH:
1182 /* If this is "\(" or "\)" chew off the backslash and
1184 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1189 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
1194 /* If a macro is used, parse the expanded macro recursively. */
1196 const char *buf = tre_expand_macro(ctx->re + 1);
1199 tre_parse_ctx_t subctx;
1200 memcpy(&subctx, ctx, sizeof(subctx));
1202 subctx.nofirstsub = 1;
1203 status = tre_parse(&subctx);
1204 if (status != REG_OK)
1207 ctx->position = subctx.position;
1208 result = subctx.result;
1214 /* Trailing backslash. */
1221 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1226 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1227 ASSERT_AT_WB_NEG, -1);
1231 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1236 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1242 if (ctx->re[0] != CHAR_LBRACE)
1244 /* 8 bit hex char. */
1245 char tmp[3] = {0, 0, 0};
1248 if (tre_isxdigit(ctx->re[0]))
1250 tmp[0] = (char)ctx->re[0];
1253 if (tre_isxdigit(ctx->re[0]))
1255 tmp[1] = (char)ctx->re[0];
1258 val = strtol(tmp, NULL, 16);
1259 result = tre_ast_new_literal(ctx->mem, (int)val,
1260 (int)val, ctx->position);
1271 while (*ctx->re && i < sizeof tmp)
1273 if (ctx->re[0] == CHAR_RBRACE)
1275 if (tre_isxdigit(ctx->re[0]))
1277 tmp[i] = (char)ctx->re[0];
1286 val = strtol(tmp, NULL, 16);
1287 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1295 if (tre_isdigit(*ctx->re))
1297 /* Back reference. */
1298 int val = *ctx->re - '0';
1299 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1304 ctx->max_backref = MAX(val, ctx->max_backref);
1309 /* Escaped character. */
1310 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1321 case CHAR_PERIOD: /* the any-symbol */
1322 if (ctx->cflags & REG_NEWLINE)
1324 tre_ast_node_t *tmp1;
1325 tre_ast_node_t *tmp2;
1326 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1330 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1334 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1341 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1350 case CHAR_CARET: /* beginning of line assertion */
1351 /* '^' has a special meaning everywhere in EREs, and at
1352 beginning of BRE. */
1353 if (ctx->cflags & REG_EXTENDED
1354 || ctx->re == ctx->re_start)
1356 if (!(ctx->cflags & REG_EXTENDED))
1357 STACK_PUSHX(stack, int, PARSE_CATENATION);
1358 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1368 case CHAR_DOLLAR: /* end of line assertion. */
1369 /* '$' is special everywhere in EREs, and in the end of the
1371 if (ctx->cflags & REG_EXTENDED
1374 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1391 case CHAR_QUESTIONMARK:
1392 if (!(ctx->cflags & REG_EXTENDED))
1397 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1405 clen = mbtowc(&wc, ctx->re, -1);
1406 if (clen<0) clen=1, wc=WEOF;
1408 /* Note that we can't use an tre_isalpha() test here, since there
1409 may be characters which are alphabetic but neither upper or
1411 if (ctx->cflags & REG_ICASE
1412 && (tre_isupper(wc) || tre_islower(wc)))
1414 tre_ast_node_t *tmp1;
1415 tre_ast_node_t *tmp2;
1417 /* XXX - Can there be more than one opposite-case
1418 counterpoints for some character in some locale? Or
1419 more than two characters which all should be regarded
1420 the same character if case is ignored? If yes, there
1421 does not seem to be a portable way to detect it. I guess
1422 that at least for multi-character collating elements there
1423 could be several opposite-case counterpoints, but they
1424 cannot be supported portably anyway. */
1425 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1430 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1435 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1441 result = tre_ast_new_literal(ctx->mem, wc, wc,
1452 case PARSE_MARK_FOR_SUBMATCH:
1454 int submatch_id = tre_stack_pop_int(stack);
1456 if (result->submatch_id >= 0)
1458 tre_ast_node_t *n, *tmp_node;
1459 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1462 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1463 if (tmp_node == NULL)
1465 tmp_node->num_submatches = result->num_submatches;
1468 result->submatch_id = submatch_id;
1469 result->num_submatches++;
1473 case PARSE_RESTORE_CFLAGS:
1474 ctx->cflags = tre_stack_pop_int(stack);
1483 /* Check for missing closing parentheses. */
1487 if (status == REG_OK)
1488 ctx->result = result;
1495 /***********************************************************************
1497 ***********************************************************************/
1502 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1507 Algorithms to setup tags so that submatch addressing can be done.
1511 /* Inserts a catenation node to the root of the tree given in `node'.
1512 As the left child a new tag with number `tag_id' to `node' is added,
1513 and the right child is the old root. */
1514 static reg_errcode_t
1515 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1517 tre_catenation_t *c;
1519 c = tre_mem_alloc(mem, sizeof(*c));
1522 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1523 if (c->left == NULL)
1525 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1526 if (c->right == NULL)
1529 c->right->obj = node->obj;
1530 c->right->type = node->type;
1531 c->right->nullable = -1;
1532 c->right->submatch_id = -1;
1533 c->right->firstpos = NULL;
1534 c->right->lastpos = NULL;
1535 c->right->num_tags = 0;
1537 node->type = CATENATION;
1541 /* Inserts a catenation node to the root of the tree given in `node'.
1542 As the right child a new tag with number `tag_id' to `node' is added,
1543 and the left child is the old root. */
1544 static reg_errcode_t
1545 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1547 tre_catenation_t *c;
1549 c = tre_mem_alloc(mem, sizeof(*c));
1552 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1553 if (c->right == NULL)
1555 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1556 if (c->left == NULL)
1559 c->left->obj = node->obj;
1560 c->left->type = node->type;
1561 c->left->nullable = -1;
1562 c->left->submatch_id = -1;
1563 c->left->firstpos = NULL;
1564 c->left->lastpos = NULL;
1565 c->left->num_tags = 0;
1567 node->type = CATENATION;
1573 ADDTAGS_AFTER_ITERATION,
1574 ADDTAGS_AFTER_UNION_LEFT,
1575 ADDTAGS_AFTER_UNION_RIGHT,
1576 ADDTAGS_AFTER_CAT_LEFT,
1577 ADDTAGS_AFTER_CAT_RIGHT,
1578 ADDTAGS_SET_SUBMATCH_END
1579 } tre_addtags_symbol_t;
1588 /* Go through `regset' and set submatch data for submatches that are
1591 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1595 for (i = 0; regset[i] >= 0; i++)
1597 int id = regset[i] / 2;
1598 int start = !(regset[i] % 2);
1600 tnfa->submatch_data[id].so_tag = tag;
1602 tnfa->submatch_data[id].eo_tag = tag;
1608 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1609 subexpressions marked for submatch addressing can be traced. */
1610 static reg_errcode_t
1611 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1614 reg_errcode_t status = REG_OK;
1615 tre_addtags_symbol_t symbol;
1616 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1617 int bottom = tre_stack_num_objects(stack);
1618 /* True for first pass (counting number of needed tags) */
1619 int first_pass = (mem == NULL || tnfa == NULL);
1620 int *regset, *orig_regset;
1621 int num_tags = 0; /* Total number of tags. */
1622 int num_minimals = 0; /* Number of special minimal tags. */
1623 int tag = 0; /* The tag that is to be added next. */
1624 int next_tag = 1; /* Next tag to use after this one. */
1625 int *parents; /* Stack of submatches the current submatch is
1627 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1628 tre_tag_states_t *saved_states;
1630 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1634 tnfa->minimal_tags[0] = -1;
1637 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1641 orig_regset = regset;
1643 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1644 if (parents == NULL)
1651 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1652 if (saved_states == NULL)
1661 for (i = 0; i <= tnfa->num_submatches; i++)
1662 saved_states[i].tag = -1;
1665 STACK_PUSH(stack, voidptr, node);
1666 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1668 while (tre_stack_num_objects(stack) > bottom)
1670 if (status != REG_OK)
1673 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1677 case ADDTAGS_SET_SUBMATCH_END:
1679 int id = tre_stack_pop_int(stack);
1682 /* Add end of this submatch to regset. */
1683 for (i = 0; regset[i] >= 0; i++);
1684 regset[i] = id * 2 + 1;
1687 /* Pop this submatch from the parents stack. */
1688 for (i = 0; parents[i] >= 0; i++);
1689 parents[i - 1] = -1;
1693 case ADDTAGS_RECURSE:
1694 node = tre_stack_pop_voidptr(stack);
1696 if (node->submatch_id >= 0)
1698 int id = node->submatch_id;
1702 /* Add start of this submatch to regset. */
1703 for (i = 0; regset[i] >= 0; i++);
1709 for (i = 0; parents[i] >= 0; i++);
1710 tnfa->submatch_data[id].parents = NULL;
1713 int *p = xmalloc(sizeof(*p) * (i + 1));
1716 status = REG_ESPACE;
1719 assert(tnfa->submatch_data[id].parents == NULL);
1720 tnfa->submatch_data[id].parents = p;
1721 for (i = 0; parents[i] >= 0; i++)
1727 /* Add end of this submatch to regset after processing this
1729 STACK_PUSHX(stack, int, node->submatch_id);
1730 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1737 tre_literal_t *lit = node->obj;
1739 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1744 /* Regset is not empty, so add a tag before the
1745 literal or backref. */
1748 status = tre_add_tag_left(mem, node, tag);
1749 tnfa->tag_directions[tag] = direction;
1750 if (minimal_tag >= 0)
1752 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1753 tnfa->minimal_tags[i] = tag;
1754 tnfa->minimal_tags[i + 1] = minimal_tag;
1755 tnfa->minimal_tags[i + 2] = -1;
1759 tre_purge_regset(regset, tnfa, tag);
1774 assert(!IS_TAG(lit));
1780 tre_catenation_t *cat = node->obj;
1781 tre_ast_node_t *left = cat->left;
1782 tre_ast_node_t *right = cat->right;
1783 int reserved_tag = -1;
1786 /* After processing right child. */
1787 STACK_PUSHX(stack, voidptr, node);
1788 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1790 /* Process right child. */
1791 STACK_PUSHX(stack, voidptr, right);
1792 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1794 /* After processing left child. */
1795 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1796 if (left->num_tags > 0 && right->num_tags > 0)
1798 /* Reserve the next tag to the right child. */
1799 reserved_tag = next_tag;
1802 STACK_PUSHX(stack, int, reserved_tag);
1803 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1805 /* Process left child. */
1806 STACK_PUSHX(stack, voidptr, left);
1807 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1813 tre_iteration_t *iter = node->obj;
1817 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1821 STACK_PUSHX(stack, int, tag);
1822 STACK_PUSHX(stack, int, iter->minimal);
1824 STACK_PUSHX(stack, voidptr, node);
1825 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1827 STACK_PUSHX(stack, voidptr, iter->arg);
1828 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1830 /* Regset is not empty, so add a tag here. */
1831 if (regset[0] >= 0 || iter->minimal)
1836 status = tre_add_tag_left(mem, node, tag);
1838 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1840 tnfa->tag_directions[tag] = direction;
1841 if (minimal_tag >= 0)
1843 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1844 tnfa->minimal_tags[i] = tag;
1845 tnfa->minimal_tags[i + 1] = minimal_tag;
1846 tnfa->minimal_tags[i + 2] = -1;
1850 tre_purge_regset(regset, tnfa, tag);
1858 direction = TRE_TAG_MINIMIZE;
1863 tre_union_t *uni = node->obj;
1864 tre_ast_node_t *left = uni->left;
1865 tre_ast_node_t *right = uni->right;
1871 left_tag = next_tag;
1872 right_tag = next_tag + 1;
1877 right_tag = next_tag;
1880 /* After processing right child. */
1881 STACK_PUSHX(stack, int, right_tag);
1882 STACK_PUSHX(stack, int, left_tag);
1883 STACK_PUSHX(stack, voidptr, regset);
1884 STACK_PUSHX(stack, int, regset[0] >= 0);
1885 STACK_PUSHX(stack, voidptr, node);
1886 STACK_PUSHX(stack, voidptr, right);
1887 STACK_PUSHX(stack, voidptr, left);
1888 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1890 /* Process right child. */
1891 STACK_PUSHX(stack, voidptr, right);
1892 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1894 /* After processing left child. */
1895 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1897 /* Process left child. */
1898 STACK_PUSHX(stack, voidptr, left);
1899 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1901 /* Regset is not empty, so add a tag here. */
1907 status = tre_add_tag_left(mem, node, tag);
1908 tnfa->tag_directions[tag] = direction;
1909 if (minimal_tag >= 0)
1911 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1912 tnfa->minimal_tags[i] = tag;
1913 tnfa->minimal_tags[i + 1] = minimal_tag;
1914 tnfa->minimal_tags[i + 2] = -1;
1918 tre_purge_regset(regset, tnfa, tag);
1927 if (node->num_submatches > 0)
1929 /* The next two tags are reserved for markers. */
1939 if (node->submatch_id >= 0)
1942 /* Push this submatch on the parents stack. */
1943 for (i = 0; parents[i] >= 0; i++);
1944 parents[i] = node->submatch_id;
1945 parents[i + 1] = -1;
1948 break; /* end case: ADDTAGS_RECURSE */
1950 case ADDTAGS_AFTER_ITERATION:
1954 node = tre_stack_pop_voidptr(stack);
1957 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1958 + tre_stack_pop_int(stack);
1963 minimal = tre_stack_pop_int(stack);
1964 enter_tag = tre_stack_pop_int(stack);
1966 minimal_tag = enter_tag;
1972 direction = TRE_TAG_MINIMIZE;
1974 direction = TRE_TAG_MAXIMIZE;
1979 case ADDTAGS_AFTER_CAT_LEFT:
1981 int new_tag = tre_stack_pop_int(stack);
1982 next_tag = tre_stack_pop_int(stack);
1990 case ADDTAGS_AFTER_CAT_RIGHT:
1991 node = tre_stack_pop_voidptr(stack);
1993 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1994 + ((tre_catenation_t *)node->obj)->right->num_tags;
1997 case ADDTAGS_AFTER_UNION_LEFT:
1998 /* Lift the bottom of the `regset' array so that when processing
1999 the right operand the items currently in the array are
2000 invisible. The original bottom was saved at ADDTAGS_UNION and
2001 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
2002 while (*regset >= 0)
2006 case ADDTAGS_AFTER_UNION_RIGHT:
2008 int added_tags, tag_left, tag_right;
2009 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2010 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2011 node = tre_stack_pop_voidptr(stack);
2012 added_tags = tre_stack_pop_int(stack);
2015 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2016 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2017 + ((node->num_submatches > 0) ? 2 : 0);
2019 regset = tre_stack_pop_voidptr(stack);
2020 tag_left = tre_stack_pop_int(stack);
2021 tag_right = tre_stack_pop_int(stack);
2023 /* Add tags after both children, the left child gets a smaller
2024 tag than the right child. This guarantees that we prefer
2025 the left child over the right child. */
2026 /* XXX - This is not always necessary (if the children have
2027 tags which must be seen for every match of that child). */
2028 /* XXX - Check if this is the only place where tre_add_tag_right
2029 is used. If so, use tre_add_tag_left (putting the tag before
2030 the child as opposed after the child) and throw away
2031 tre_add_tag_right. */
2032 if (node->num_submatches > 0)
2036 status = tre_add_tag_right(mem, left, tag_left);
2037 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2038 status = tre_add_tag_right(mem, right, tag_right);
2039 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2043 direction = TRE_TAG_MAXIMIZE;
2051 } /* end switch(symbol) */
2052 } /* end while(tre_stack_num_objects(stack) > bottom) */
2055 tre_purge_regset(regset, tnfa, tag);
2057 if (!first_pass && minimal_tag >= 0)
2060 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2061 tnfa->minimal_tags[i] = tag;
2062 tnfa->minimal_tags[i + 1] = minimal_tag;
2063 tnfa->minimal_tags[i + 2] = -1;
2068 assert(tree->num_tags == num_tags);
2069 tnfa->end_tag = num_tags;
2070 tnfa->num_tags = num_tags;
2071 tnfa->num_minimals = num_minimals;
2074 xfree(saved_states);
2081 AST to TNFA compilation routines.
2087 } tre_copyast_symbol_t;
2089 /* Flags for tre_copy_ast(). */
2090 #define COPY_REMOVE_TAGS 1
2091 #define COPY_MAXIMIZE_FIRST_TAG 2
2093 static reg_errcode_t
2094 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2095 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2096 tre_ast_node_t **copy, int *max_pos)
2098 reg_errcode_t status = REG_OK;
2099 int bottom = tre_stack_num_objects(stack);
2102 tre_ast_node_t **result = copy;
2103 tre_copyast_symbol_t symbol;
2105 STACK_PUSH(stack, voidptr, ast);
2106 STACK_PUSH(stack, int, COPY_RECURSE);
2108 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2110 tre_ast_node_t *node;
2111 if (status != REG_OK)
2114 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2117 case COPY_SET_RESULT_PTR:
2118 result = tre_stack_pop_voidptr(stack);
2121 node = tre_stack_pop_voidptr(stack);
2126 tre_literal_t *lit = node->obj;
2127 int pos = lit->position;
2128 int min = lit->code_min;
2129 int max = lit->code_max;
2130 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2132 /* XXX - e.g. [ab] has only one position but two
2133 nodes, so we are creating holes in the state space
2134 here. Not fatal, just wastes memory. */
2138 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2140 /* Change this tag to empty. */
2144 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2147 /* Maximize the first tag. */
2148 tag_directions[max] = TRE_TAG_MAXIMIZE;
2151 *result = tre_ast_new_literal(mem, min, max, pos);
2152 if (*result == NULL)
2153 status = REG_ESPACE;
2161 tre_union_t *uni = node->obj;
2163 *result = tre_ast_new_union(mem, uni->left, uni->right);
2164 if (*result == NULL)
2166 status = REG_ESPACE;
2169 tmp = (*result)->obj;
2170 result = &tmp->left;
2171 STACK_PUSHX(stack, voidptr, uni->right);
2172 STACK_PUSHX(stack, int, COPY_RECURSE);
2173 STACK_PUSHX(stack, voidptr, &tmp->right);
2174 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2175 STACK_PUSHX(stack, voidptr, uni->left);
2176 STACK_PUSHX(stack, int, COPY_RECURSE);
2181 tre_catenation_t *cat = node->obj;
2182 tre_catenation_t *tmp;
2183 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2184 if (*result == NULL)
2186 status = REG_ESPACE;
2189 tmp = (*result)->obj;
2192 result = &tmp->left;
2194 STACK_PUSHX(stack, voidptr, cat->right);
2195 STACK_PUSHX(stack, int, COPY_RECURSE);
2196 STACK_PUSHX(stack, voidptr, &tmp->right);
2197 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2198 STACK_PUSHX(stack, voidptr, cat->left);
2199 STACK_PUSHX(stack, int, COPY_RECURSE);
2204 tre_iteration_t *iter = node->obj;
2205 STACK_PUSHX(stack, voidptr, iter->arg);
2206 STACK_PUSHX(stack, int, COPY_RECURSE);
2207 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2208 iter->max, iter->minimal);
2209 if (*result == NULL)
2211 status = REG_ESPACE;
2214 iter = (*result)->obj;
2215 result = &iter->arg;
2225 *pos_add += num_copied;
2232 } tre_expand_ast_symbol_t;
2234 /* Expands each iteration node that has a finite nonzero minimum or maximum
2235 iteration count to a catenated sequence of copies of the node. */
2236 static reg_errcode_t
2237 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2238 int *position, tre_tag_direction_t *tag_directions,
2241 reg_errcode_t status = REG_OK;
2242 int bottom = tre_stack_num_objects(stack);
2244 int pos_add_total = 0;
2248 STACK_PUSHR(stack, voidptr, ast);
2249 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2250 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2252 tre_ast_node_t *node;
2253 tre_expand_ast_symbol_t symbol;
2255 if (status != REG_OK)
2258 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2259 node = tre_stack_pop_voidptr(stack);
2262 case EXPAND_RECURSE:
2267 tre_literal_t *lit= node->obj;
2268 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2270 lit->position += pos_add;
2271 if (lit->position > max_pos)
2272 max_pos = lit->position;
2278 tre_union_t *uni = node->obj;
2279 STACK_PUSHX(stack, voidptr, uni->right);
2280 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2281 STACK_PUSHX(stack, voidptr, uni->left);
2282 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2287 tre_catenation_t *cat = node->obj;
2288 STACK_PUSHX(stack, voidptr, cat->right);
2289 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2290 STACK_PUSHX(stack, voidptr, cat->left);
2291 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2296 tre_iteration_t *iter = node->obj;
2297 STACK_PUSHX(stack, int, pos_add);
2298 STACK_PUSHX(stack, voidptr, node);
2299 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2300 STACK_PUSHX(stack, voidptr, iter->arg);
2301 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2302 /* If we are going to expand this node at EXPAND_AFTER_ITER
2303 then don't increase the `pos' fields of the nodes now, it
2304 will get done when expanding. */
2305 if (iter->min > 1 || iter->max > 1)
2315 case EXPAND_AFTER_ITER:
2317 tre_iteration_t *iter = node->obj;
2319 pos_add = tre_stack_pop_int(stack);
2320 pos_add_last = pos_add;
2321 if (iter->min > 1 || iter->max > 1)
2323 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2325 int pos_add_save = pos_add;
2327 /* Create a catenated sequence of copies of the node. */
2328 for (j = 0; j < iter->min; j++)
2330 tre_ast_node_t *copy;
2331 /* Remove tags from all but the last copy. */
2332 int flags = ((j + 1 < iter->min)
2334 : COPY_MAXIMIZE_FIRST_TAG);
2335 pos_add_save = pos_add;
2336 status = tre_copy_ast(mem, stack, iter->arg, flags,
2337 &pos_add, tag_directions, ©,
2339 if (status != REG_OK)
2342 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2349 if (iter->max == -1)
2351 /* No upper limit. */
2352 pos_add_save = pos_add;
2353 status = tre_copy_ast(mem, stack, iter->arg, 0,
2354 &pos_add, NULL, &seq2, &max_pos);
2355 if (status != REG_OK)
2357 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2363 for (j = iter->min; j < iter->max; j++)
2365 tre_ast_node_t *tmp, *copy;
2366 pos_add_save = pos_add;
2367 status = tre_copy_ast(mem, stack, iter->arg, 0,
2368 &pos_add, NULL, ©, &max_pos);
2369 if (status != REG_OK)
2372 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2377 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2380 seq2 = tre_ast_new_union(mem, tmp, seq2);
2386 pos_add = pos_add_save;
2389 else if (seq2 != NULL)
2390 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2393 node->obj = seq1->obj;
2394 node->type = seq1->type;
2398 pos_add_total += pos_add - pos_add_last;
2399 if (iter_depth == 0)
2400 pos_add = pos_add_total;
2410 *position += pos_add_total;
2412 /* `max_pos' should never be larger than `*position' if the above
2413 code works, but just an extra safeguard let's make sure
2414 `*position' is set large enough so enough memory will be
2415 allocated for the transition table. */
2416 if (max_pos > *position)
2417 *position = max_pos;
2422 static tre_pos_and_tags_t *
2423 tre_set_empty(tre_mem_t mem)
2425 tre_pos_and_tags_t *new_set;
2427 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2428 if (new_set == NULL)
2431 new_set[0].position = -1;
2432 new_set[0].code_min = -1;
2433 new_set[0].code_max = -1;
2438 static tre_pos_and_tags_t *
2439 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2440 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2442 tre_pos_and_tags_t *new_set;
2444 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2445 if (new_set == NULL)
2448 new_set[0].position = position;
2449 new_set[0].code_min = code_min;
2450 new_set[0].code_max = code_max;
2451 new_set[0].class = class;
2452 new_set[0].neg_classes = neg_classes;
2453 new_set[0].backref = backref;
2454 new_set[1].position = -1;
2455 new_set[1].code_min = -1;
2456 new_set[1].code_max = -1;
2461 static tre_pos_and_tags_t *
2462 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2463 int *tags, int assertions)
2466 tre_pos_and_tags_t *new_set;
2470 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2471 for (s1 = 0; set1[s1].position >= 0; s1++);
2472 for (s2 = 0; set2[s2].position >= 0; s2++);
2473 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2477 for (s1 = 0; set1[s1].position >= 0; s1++)
2479 new_set[s1].position = set1[s1].position;
2480 new_set[s1].code_min = set1[s1].code_min;
2481 new_set[s1].code_max = set1[s1].code_max;
2482 new_set[s1].assertions = set1[s1].assertions | assertions;
2483 new_set[s1].class = set1[s1].class;
2484 new_set[s1].neg_classes = set1[s1].neg_classes;
2485 new_set[s1].backref = set1[s1].backref;
2486 if (set1[s1].tags == NULL && tags == NULL)
2487 new_set[s1].tags = NULL;
2490 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2491 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2492 * (i + num_tags + 1)));
2493 if (new_tags == NULL)
2495 for (j = 0; j < i; j++)
2496 new_tags[j] = set1[s1].tags[j];
2497 for (i = 0; i < num_tags; i++)
2498 new_tags[j + i] = tags[i];
2499 new_tags[j + i] = -1;
2500 new_set[s1].tags = new_tags;
2504 for (s2 = 0; set2[s2].position >= 0; s2++)
2506 new_set[s1 + s2].position = set2[s2].position;
2507 new_set[s1 + s2].code_min = set2[s2].code_min;
2508 new_set[s1 + s2].code_max = set2[s2].code_max;
2509 /* XXX - why not | assertions here as well? */
2510 new_set[s1 + s2].assertions = set2[s2].assertions;
2511 new_set[s1 + s2].class = set2[s2].class;
2512 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2513 new_set[s1 + s2].backref = set2[s2].backref;
2514 if (set2[s2].tags == NULL)
2515 new_set[s1 + s2].tags = NULL;
2518 for (i = 0; set2[s2].tags[i] >= 0; i++);
2519 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2520 if (new_tags == NULL)
2522 for (j = 0; j < i; j++)
2523 new_tags[j] = set2[s2].tags[j];
2525 new_set[s1 + s2].tags = new_tags;
2528 new_set[s1 + s2].position = -1;
2532 /* Finds the empty path through `node' which is the one that should be
2533 taken according to POSIX.2 rules, and adds the tags on that path to
2534 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2535 set to the number of tags seen on the path. */
2536 static reg_errcode_t
2537 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2538 int *assertions, int *params, int *num_tags_seen,
2543 tre_catenation_t *cat;
2544 tre_iteration_t *iter;
2546 int bottom = tre_stack_num_objects(stack);
2547 reg_errcode_t status = REG_OK;
2551 status = tre_stack_push_voidptr(stack, node);
2553 /* Walk through the tree recursively. */
2554 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2556 node = tre_stack_pop_voidptr(stack);
2561 lit = (tre_literal_t *)node->obj;
2562 switch (lit->code_min)
2565 if (lit->code_max >= 0)
2569 /* Add the tag to `tags'. */
2570 for (i = 0; tags[i] >= 0; i++)
2571 if (tags[i] == lit->code_max)
2575 tags[i] = lit->code_max;
2584 assert(lit->code_max >= 1
2585 || lit->code_max <= ASSERT_LAST);
2586 if (assertions != NULL)
2587 *assertions |= lit->code_max;
2598 /* Subexpressions starting earlier take priority over ones
2599 starting later, so we prefer the left subexpression over the
2600 right subexpression. */
2601 uni = (tre_union_t *)node->obj;
2602 if (uni->left->nullable)
2603 STACK_PUSHX(stack, voidptr, uni->left)
2604 else if (uni->right->nullable)
2605 STACK_PUSHX(stack, voidptr, uni->right)
2611 /* The path must go through both children. */
2612 cat = (tre_catenation_t *)node->obj;
2613 assert(cat->left->nullable);
2614 assert(cat->right->nullable);
2615 STACK_PUSHX(stack, voidptr, cat->left);
2616 STACK_PUSHX(stack, voidptr, cat->right);
2620 /* A match with an empty string is preferred over no match at
2621 all, so we go through the argument if possible. */
2622 iter = (tre_iteration_t *)node->obj;
2623 if (iter->arg->nullable)
2624 STACK_PUSHX(stack, voidptr, iter->arg);
2640 NFL_POST_CATENATION,
2642 } tre_nfl_stack_symbol_t;
2645 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2646 the nodes of the AST `tree'. */
2647 static reg_errcode_t
2648 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2650 int bottom = tre_stack_num_objects(stack);
2652 STACK_PUSHR(stack, voidptr, tree);
2653 STACK_PUSHR(stack, int, NFL_RECURSE);
2655 while (tre_stack_num_objects(stack) > bottom)
2657 tre_nfl_stack_symbol_t symbol;
2658 tre_ast_node_t *node;
2660 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2661 node = tre_stack_pop_voidptr(stack);
2669 tre_literal_t *lit = (tre_literal_t *)node->obj;
2670 if (IS_BACKREF(lit))
2672 /* Back references: nullable = false, firstpos = {i},
2675 node->firstpos = tre_set_one(mem, lit->position, 0,
2676 TRE_CHAR_MAX, 0, NULL, -1);
2677 if (!node->firstpos)
2679 node->lastpos = tre_set_one(mem, lit->position, 0,
2680 TRE_CHAR_MAX, 0, NULL,
2681 (int)lit->code_max);
2685 else if (lit->code_min < 0)
2687 /* Tags, empty strings, params, and zero width assertions:
2688 nullable = true, firstpos = {}, and lastpos = {}. */
2690 node->firstpos = tre_set_empty(mem);
2691 if (!node->firstpos)
2693 node->lastpos = tre_set_empty(mem);
2699 /* Literal at position i: nullable = false, firstpos = {i},
2703 tre_set_one(mem, lit->position, (int)lit->code_min,
2704 (int)lit->code_max, 0, NULL, -1);
2705 if (!node->firstpos)
2707 node->lastpos = tre_set_one(mem, lit->position,
2710 lit->u.class, lit->neg_classes,
2719 /* Compute the attributes for the two subtrees, and after that
2721 STACK_PUSHR(stack, voidptr, node);
2722 STACK_PUSHR(stack, int, NFL_POST_UNION);
2723 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2724 STACK_PUSHR(stack, int, NFL_RECURSE);
2725 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2726 STACK_PUSHR(stack, int, NFL_RECURSE);
2730 /* Compute the attributes for the two subtrees, and after that
2732 STACK_PUSHR(stack, voidptr, node);
2733 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2734 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2735 STACK_PUSHR(stack, int, NFL_RECURSE);
2736 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2737 STACK_PUSHR(stack, int, NFL_RECURSE);
2741 /* Compute the attributes for the subtree, and after that for
2743 STACK_PUSHR(stack, voidptr, node);
2744 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2745 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2746 STACK_PUSHR(stack, int, NFL_RECURSE);
2749 break; /* end case: NFL_RECURSE */
2751 case NFL_POST_UNION:
2753 tre_union_t *uni = (tre_union_t *)node->obj;
2754 node->nullable = uni->left->nullable || uni->right->nullable;
2755 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2756 uni->right->firstpos, NULL, 0);
2757 if (!node->firstpos)
2759 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2760 uni->right->lastpos, NULL, 0);
2766 case NFL_POST_ITERATION:
2768 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2770 if (iter->min == 0 || iter->arg->nullable)
2774 node->firstpos = iter->arg->firstpos;
2775 node->lastpos = iter->arg->lastpos;
2779 case NFL_POST_CATENATION:
2781 int num_tags, *tags, assertions, params_seen;
2783 reg_errcode_t status;
2784 tre_catenation_t *cat = node->obj;
2785 node->nullable = cat->left->nullable && cat->right->nullable;
2787 /* Compute firstpos. */
2788 if (cat->left->nullable)
2790 /* The left side matches the empty string. Make a first pass
2791 with tre_match_empty() to get the number of tags and
2793 status = tre_match_empty(stack, cat->left,
2794 NULL, NULL, NULL, &num_tags,
2796 if (status != REG_OK)
2798 /* Allocate arrays for the tags and parameters. */
2799 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2804 /* Second pass with tre_mach_empty() to get the list of
2805 tags and parameters. */
2806 status = tre_match_empty(stack, cat->left, tags,
2807 &assertions, params, NULL, NULL);
2808 if (status != REG_OK)
2814 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2817 if (!node->firstpos)
2822 node->firstpos = cat->left->firstpos;
2825 /* Compute lastpos. */
2826 if (cat->right->nullable)
2828 /* The right side matches the empty string. Make a first pass
2829 with tre_match_empty() to get the number of tags and
2831 status = tre_match_empty(stack, cat->right,
2832 NULL, NULL, NULL, &num_tags,
2834 if (status != REG_OK)
2836 /* Allocate arrays for the tags and parameters. */
2837 tags = xmalloc(sizeof(int) * (num_tags + 1));
2842 /* Second pass with tre_mach_empty() to get the list of
2843 tags and parameters. */
2844 status = tre_match_empty(stack, cat->right, tags,
2845 &assertions, params, NULL, NULL);
2846 if (status != REG_OK)
2852 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2860 node->lastpos = cat->right->lastpos;
2875 /* Adds a transition from each position in `p1' to each position in `p2'. */
2876 static reg_errcode_t
2877 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2878 tre_tnfa_transition_t *transitions,
2879 int *counts, int *offs)
2881 tre_pos_and_tags_t *orig_p2 = p2;
2882 tre_tnfa_transition_t *trans;
2883 int i, j, k, l, dup, prev_p2_pos;
2885 if (transitions != NULL)
2886 while (p1->position >= 0)
2890 while (p2->position >= 0)
2892 /* Optimization: if this position was already handled, skip it. */
2893 if (p2->position == prev_p2_pos)
2898 prev_p2_pos = p2->position;
2899 /* Set `trans' to point to the next unused transition from
2900 position `p1->position'. */
2901 trans = transitions + offs[p1->position];
2902 while (trans->state != NULL)
2905 /* If we find a previous transition from `p1->position' to
2906 `p2->position', it is overwritten. This can happen only
2907 if there are nested loops in the regexp, like in "((a)*)*".
2908 In POSIX.2 repetition using the outer loop is always
2909 preferred over using the inner loop. Therefore the
2910 transition for the inner loop is useless and can be thrown
2912 /* XXX - The same position is used for all nodes in a bracket
2913 expression, so this optimization cannot be used (it will
2914 break bracket expressions) unless I figure out a way to
2916 if (trans->state_id == p2->position)
2924 if (trans->state == NULL)
2925 (trans + 1)->state = NULL;
2926 /* Use the character ranges, assertions, etc. from `p1' for
2927 the transition from `p1' to `p2'. */
2928 trans->code_min = p1->code_min;
2929 trans->code_max = p1->code_max;
2930 trans->state = transitions + offs[p2->position];
2931 trans->state_id = p2->position;
2932 trans->assertions = p1->assertions | p2->assertions
2933 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2934 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2935 if (p1->backref >= 0)
2937 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2938 assert(p2->backref < 0);
2939 trans->u.backref = p1->backref;
2940 trans->assertions |= ASSERT_BACKREF;
2943 trans->u.class = p1->class;
2944 if (p1->neg_classes != NULL)
2946 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2947 trans->neg_classes =
2948 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2949 if (trans->neg_classes == NULL)
2951 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2952 trans->neg_classes[i] = p1->neg_classes[i];
2953 trans->neg_classes[i] = (tre_ctype_t)0;
2956 trans->neg_classes = NULL;
2958 /* Find out how many tags this transition has. */
2960 if (p1->tags != NULL)
2961 while(p1->tags[i] >= 0)
2964 if (p2->tags != NULL)
2965 while(p2->tags[j] >= 0)
2968 /* If we are overwriting a transition, free the old tag array. */
2969 if (trans->tags != NULL)
2973 /* If there were any tags, allocate an array and fill it. */
2976 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2980 if (p1->tags != NULL)
2981 while(p1->tags[i] >= 0)
2983 trans->tags[i] = p1->tags[i];
2988 if (p2->tags != NULL)
2989 while (p2->tags[j] >= 0)
2991 /* Don't add duplicates. */
2993 for (k = 0; k < i; k++)
2994 if (trans->tags[k] == p2->tags[j])
3000 trans->tags[l++] = p2->tags[j];
3003 trans->tags[l] = -1;
3011 /* Compute a maximum limit for the number of transitions leaving
3013 while (p1->position >= 0)
3016 while (p2->position >= 0)
3018 counts[p1->position]++;
3026 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3027 labelled with one character range (there are no transitions on empty
3028 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3030 static reg_errcode_t
3031 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3032 int *counts, int *offs)
3035 tre_catenation_t *cat;
3036 tre_iteration_t *iter;
3037 reg_errcode_t errcode = REG_OK;
3039 /* XXX - recurse using a stack!. */
3045 uni = (tre_union_t *)node->obj;
3046 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3047 if (errcode != REG_OK)
3049 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3053 cat = (tre_catenation_t *)node->obj;
3054 /* Add a transition from each position in cat->left->lastpos
3055 to each position in cat->right->firstpos. */
3056 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3057 transitions, counts, offs);
3058 if (errcode != REG_OK)
3060 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3061 if (errcode != REG_OK)
3063 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3067 iter = (tre_iteration_t *)node->obj;
3068 assert(iter->max == -1 || iter->max == 1);
3070 if (iter->max == -1)
3072 assert(iter->min == 0 || iter->min == 1);
3073 /* Add a transition from each last position in the iterated
3074 expression to each first position. */
3075 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3076 transitions, counts, offs);
3077 if (errcode != REG_OK)
3080 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3087 #define ERROR_EXIT(err) \
3091 if (/*CONSTCOND*/1) \
3094 while (/*CONSTCOND*/0)
3098 regcomp(regex_t *preg, const char *regex, int cflags)
3101 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3102 tre_pos_and_tags_t *p;
3103 int *counts = NULL, *offs = NULL;
3105 tre_tnfa_transition_t *transitions, *initial;
3106 tre_tnfa_t *tnfa = NULL;
3107 tre_submatch_data_t *submatch_data;
3108 tre_tag_direction_t *tag_directions = NULL;
3109 reg_errcode_t errcode;
3112 /* Parse context. */
3113 tre_parse_ctx_t parse_ctx;
3115 /* Allocate a stack used throughout the compilation process for various
3117 stack = tre_stack_new(512, 10240, 128);
3120 /* Allocate a fast memory allocator. */
3121 mem = tre_mem_new();
3124 tre_stack_destroy(stack);
3128 /* Parse the regexp. */
3129 memset(&parse_ctx, 0, sizeof(parse_ctx));
3130 parse_ctx.mem = mem;
3131 parse_ctx.stack = stack;
3132 parse_ctx.re = regex;
3133 parse_ctx.cflags = cflags;
3134 parse_ctx.max_backref = -1;
3135 errcode = tre_parse(&parse_ctx);
3136 if (errcode != REG_OK)
3137 ERROR_EXIT(errcode);
3138 preg->re_nsub = parse_ctx.submatch_id - 1;
3139 tree = parse_ctx.result;
3141 /* Back references and approximate matching cannot currently be used
3142 in the same regexp. */
3143 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3144 ERROR_EXIT(REG_BADPAT);
3147 tre_ast_print(tree);
3148 #endif /* TRE_DEBUG */
3150 /* Referring to nonexistent subexpressions is illegal. */
3151 if (parse_ctx.max_backref > (int)preg->re_nsub)
3152 ERROR_EXIT(REG_ESUBREG);
3154 /* Allocate the TNFA struct. */
3155 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3157 ERROR_EXIT(REG_ESPACE);
3158 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3159 tnfa->have_approx = parse_ctx.have_approx;
3160 tnfa->num_submatches = parse_ctx.submatch_id;
3162 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3163 regexp does not have back references, this can be skipped. */
3164 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3167 /* Figure out how many tags we will need. */
3168 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3169 if (errcode != REG_OK)
3170 ERROR_EXIT(errcode);
3172 if (tnfa->num_tags > 0)
3174 tag_directions = xmalloc(sizeof(*tag_directions)
3175 * (tnfa->num_tags + 1));
3176 if (tag_directions == NULL)
3177 ERROR_EXIT(REG_ESPACE);
3178 tnfa->tag_directions = tag_directions;
3179 memset(tag_directions, -1,
3180 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3182 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3183 sizeof(tnfa->minimal_tags));
3184 if (tnfa->minimal_tags == NULL)
3185 ERROR_EXIT(REG_ESPACE);
3187 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3188 sizeof(*submatch_data));
3189 if (submatch_data == NULL)
3190 ERROR_EXIT(REG_ESPACE);
3191 tnfa->submatch_data = submatch_data;
3193 errcode = tre_add_tags(mem, stack, tree, tnfa);
3194 if (errcode != REG_OK)
3195 ERROR_EXIT(errcode);
3199 /* Expand iteration nodes. */
3200 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3201 tag_directions, &tnfa->params_depth);
3202 if (errcode != REG_OK)
3203 ERROR_EXIT(errcode);
3205 /* Add a dummy node for the final state.
3206 XXX - For certain patterns this dummy node can be optimized away,
3207 for example "a*" or "ab*". Figure out a simple way to detect
3208 this possibility. */
3210 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3211 if (tmp_ast_r == NULL)
3212 ERROR_EXIT(REG_ESPACE);
3214 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3216 ERROR_EXIT(REG_ESPACE);
3218 errcode = tre_compute_nfl(mem, stack, tree);
3219 if (errcode != REG_OK)
3220 ERROR_EXIT(errcode);
3222 counts = xmalloc(sizeof(int) * parse_ctx.position);
3224 ERROR_EXIT(REG_ESPACE);
3226 offs = xmalloc(sizeof(int) * parse_ctx.position);
3228 ERROR_EXIT(REG_ESPACE);
3230 for (i = 0; i < parse_ctx.position; i++)
3232 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3235 for (i = 0; i < parse_ctx.position; i++)
3238 add += counts[i] + 1;
3241 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3242 if (transitions == NULL)
3243 ERROR_EXIT(REG_ESPACE);
3244 tnfa->transitions = transitions;
3245 tnfa->num_transitions = add;
3247 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3248 if (errcode != REG_OK)
3249 ERROR_EXIT(errcode);
3251 tnfa->firstpos_chars = NULL;
3255 while (p->position >= 0)
3261 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3262 if (initial == NULL)
3263 ERROR_EXIT(REG_ESPACE);
3264 tnfa->initial = initial;
3267 for (p = tree->firstpos; p->position >= 0; p++)
3269 initial[i].state = transitions + offs[p->position];
3270 initial[i].state_id = p->position;
3271 initial[i].tags = NULL;
3272 /* Copy the arrays p->tags, and p->params, they are allocated
3273 from a tre_mem object. */
3277 for (j = 0; p->tags[j] >= 0; j++);
3278 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3279 if (!initial[i].tags)
3280 ERROR_EXIT(REG_ESPACE);
3281 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3283 initial[i].assertions = p->assertions;
3286 initial[i].state = NULL;
3288 tnfa->num_transitions = add;
3289 tnfa->final = transitions + offs[tree->lastpos[0].position];
3290 tnfa->num_states = parse_ctx.position;
3291 tnfa->cflags = cflags;
3293 tre_mem_destroy(mem);
3294 tre_stack_destroy(stack);
3298 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3302 /* Free everything that was allocated and return the error code. */
3303 tre_mem_destroy(mem);
3305 tre_stack_destroy(stack);
3310 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3319 regfree(regex_t *preg)
3323 tre_tnfa_transition_t *trans;
3325 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3329 for (i = 0; i < tnfa->num_transitions; i++)
3330 if (tnfa->transitions[i].state)
3332 if (tnfa->transitions[i].tags)
3333 xfree(tnfa->transitions[i].tags);
3334 if (tnfa->transitions[i].neg_classes)
3335 xfree(tnfa->transitions[i].neg_classes);
3337 if (tnfa->transitions)
3338 xfree(tnfa->transitions);
3342 for (trans = tnfa->initial; trans->state; trans++)
3347 xfree(tnfa->initial);
3350 if (tnfa->submatch_data)
3352 for (i = 0; i < tnfa->num_submatches; i++)
3353 if (tnfa->submatch_data[i].parents)
3354 xfree(tnfa->submatch_data[i].parents);
3355 xfree(tnfa->submatch_data);
3358 if (tnfa->tag_directions)
3359 xfree(tnfa->tag_directions);
3360 if (tnfa->firstpos_chars)
3361 xfree(tnfa->firstpos_chars);
3362 if (tnfa->minimal_tags)
3363 xfree(tnfa->minimal_tags);