2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 /***********************************************************************
45 ***********************************************************************/
54 tre_ctype_t *neg_classes;
60 /***********************************************************************
61 from tre-ast.c and tre-ast.h
62 ***********************************************************************/
64 /* The different AST node types. */
72 /* Special subtypes of TRE_LITERAL. */
73 #define EMPTY -1 /* Empty leaf (denotes empty string). */
74 #define ASSERTION -2 /* Assertion leaf. */
75 #define TAG -3 /* Tag leaf. */
76 #define BACKREF -4 /* Back reference leaf. */
78 #define IS_SPECIAL(x) ((x)->code_min < 0)
79 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
80 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
81 #define IS_TAG(x) ((x)->code_min == TAG)
82 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
85 /* A generic AST node. All AST nodes consist of this node on the top
86 level with `obj' pointing to the actual content. */
88 tre_ast_type_t type; /* Type of the node. */
89 void *obj; /* Pointer to actual node. */
94 tre_pos_and_tags_t *firstpos;
95 tre_pos_and_tags_t *lastpos;
99 /* A "literal" node. These are created for assertions, back references,
100 tags, matching parameter settings, and all expressions that match one
110 tre_ctype_t *neg_classes;
113 /* A "catenation" node. These are created when two regexps are concatenated.
114 If there are more than one subexpressions in sequence, the `left' part
115 holds all but the last, and `right' part holds the last subexpression
116 (catenation is left associative). */
118 tre_ast_node_t *left;
119 tre_ast_node_t *right;
122 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
125 /* Subexpression to match. */
127 /* Minimum number of consecutive matches. */
129 /* Maximum number of consecutive matches. */
131 /* If 0, match as many characters as possible, if 1 match as few as
132 possible. Note that this does not always mean the same thing as
133 matching as many/few repetitions as possible. */
134 unsigned int minimal:1;
137 /* An "union" node. These are created for the "|" operator. */
139 tre_ast_node_t *left;
140 tre_ast_node_t *right;
143 static tre_ast_node_t *
144 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
146 static tre_ast_node_t *
147 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
149 static tre_ast_node_t *
150 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
153 static tre_ast_node_t *
154 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
156 static tre_ast_node_t *
157 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
158 tre_ast_node_t *right);
161 static tre_ast_node_t *
162 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
164 tre_ast_node_t *node;
166 node = tre_mem_calloc(mem, sizeof(*node));
169 node->obj = tre_mem_calloc(mem, size);
174 node->submatch_id = -1;
179 static tre_ast_node_t *
180 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
182 tre_ast_node_t *node;
185 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
189 lit->code_min = code_min;
190 lit->code_max = code_max;
191 lit->position = position;
196 static tre_ast_node_t *
197 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
200 tre_ast_node_t *node;
201 tre_iteration_t *iter;
203 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
210 iter->minimal = minimal;
211 node->num_submatches = arg->num_submatches;
216 static tre_ast_node_t *
217 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
219 tre_ast_node_t *node;
221 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
224 ((tre_union_t *)node->obj)->left = left;
225 ((tre_union_t *)node->obj)->right = right;
226 node->num_submatches = left->num_submatches + right->num_submatches;
231 static tre_ast_node_t *
232 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
233 tre_ast_node_t *right)
235 tre_ast_node_t *node;
237 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
240 ((tre_catenation_t *)node->obj)->left = left;
241 ((tre_catenation_t *)node->obj)->right = right;
242 node->num_submatches = left->num_submatches + right->num_submatches;
248 /***********************************************************************
249 from tre-stack.c and tre-stack.h
250 ***********************************************************************/
252 typedef struct tre_stack_rec tre_stack_t;
254 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
255 is maximum size, and `increment' specifies how much more space will be
256 allocated with realloc() if all space gets used up. Returns the stack
257 object or NULL if out of memory. */
259 tre_stack_new(int size, int max_size, int increment);
261 /* Frees the stack object. */
263 tre_stack_destroy(tre_stack_t *s);
265 /* Returns the current number of objects in the stack. */
267 tre_stack_num_objects(tre_stack_t *s);
269 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
270 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
271 This tries to realloc() more space before failing if maximum size
272 has not yet been reached. Returns REG_OK if successful. */
273 #define declare_pushf(typetag, type) \
274 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
276 declare_pushf(voidptr, void *);
277 declare_pushf(int, int);
279 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
280 element off of stack `s' and returns it. The stack must not be
282 #define declare_popf(typetag, type) \
283 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
285 declare_popf(voidptr, void *);
286 declare_popf(int, int);
288 /* Just to save some typing. */
289 #define STACK_PUSH(s, typetag, value) \
292 status = tre_stack_push_ ## typetag(s, value); \
294 while (/*CONSTCOND*/0)
296 #define STACK_PUSHX(s, typetag, value) \
298 status = tre_stack_push_ ## typetag(s, value); \
299 if (status != REG_OK) \
303 #define STACK_PUSHR(s, typetag, value) \
305 reg_errcode_t _status; \
306 _status = tre_stack_push_ ## typetag(s, value); \
307 if (_status != REG_OK) \
311 union tre_stack_item {
316 struct tre_stack_rec {
321 union tre_stack_item *stack;
326 tre_stack_new(int size, int max_size, int increment)
330 s = xmalloc(sizeof(*s));
333 s->stack = xmalloc(sizeof(*s->stack) * size);
334 if (s->stack == NULL)
340 s->max_size = max_size;
341 s->increment = increment;
348 tre_stack_destroy(tre_stack_t *s)
355 tre_stack_num_objects(tre_stack_t *s)
361 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
363 if (s->ptr < s->size)
365 s->stack[s->ptr] = value;
370 if (s->size >= s->max_size)
376 union tre_stack_item *new_buffer;
378 new_size = s->size + s->increment;
379 if (new_size > s->max_size)
380 new_size = s->max_size;
381 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
382 if (new_buffer == NULL)
386 assert(new_size > s->size);
388 s->stack = new_buffer;
389 tre_stack_push(s, value);
395 #define define_pushf(typetag, type) \
396 declare_pushf(typetag, type) { \
397 union tre_stack_item item; \
398 item.typetag ## _value = value; \
399 return tre_stack_push(s, item); \
402 define_pushf(int, int)
403 define_pushf(voidptr, void *)
405 #define define_popf(typetag, type) \
406 declare_popf(typetag, type) { \
407 return s->stack[--s->ptr].typetag ## _value; \
410 define_popf(int, int)
411 define_popf(voidptr, void *)
414 /***********************************************************************
415 from tre-parse.c and tre-parse.h
416 ***********************************************************************/
420 /* Memory allocator. The AST is allocated using this. */
422 /* Stack used for keeping track of regexp syntax. */
424 /* The parse result. */
425 tre_ast_node_t *result;
426 /* The regexp to parse and its length. */
428 /* The first character of the entire regexp. */
429 const char *re_start;
430 /* Current submatch ID. */
432 /* Current position (number of literal). */
434 /* The highest back reference or -1 if none seen so far. */
436 /* This flag is set if the regexp uses approximate matching. */
438 /* Compilation flags. */
440 /* If this flag is set the top-level submatch is not captured. */
444 /* Parses a wide character regexp pattern into a syntax tree. This parser
445 handles both syntaxes (BRE and ERE), including the TRE extensions. */
447 tre_parse(tre_parse_ctx_t *ctx);
451 This parser is just a simple recursive descent parser for POSIX.2
452 regexps. The parser supports both the obsolete default syntax and
453 the "extended" syntax, and some nonstandard extensions.
456 /* Characters with special meanings in regexp syntax. */
457 #define CHAR_PIPE '|'
458 #define CHAR_LPAREN '('
459 #define CHAR_RPAREN ')'
460 #define CHAR_LBRACE '{'
461 #define CHAR_RBRACE '}'
462 #define CHAR_LBRACKET '['
463 #define CHAR_RBRACKET ']'
464 #define CHAR_MINUS '-'
465 #define CHAR_STAR '*'
466 #define CHAR_QUESTIONMARK '?'
467 #define CHAR_PLUS '+'
468 #define CHAR_PERIOD '.'
469 #define CHAR_COLON ':'
470 #define CHAR_EQUAL '='
471 #define CHAR_COMMA ','
472 #define CHAR_CARET '^'
473 #define CHAR_DOLLAR '$'
474 #define CHAR_BACKSLASH '\\'
475 #define CHAR_HASH '#'
476 #define CHAR_TILDE '~'
479 /* Some macros for expanding \w, \s, etc. */
480 static const struct tre_macro_struct {
482 const char *expansion;
484 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
485 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
486 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
487 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
492 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
493 must have at least `len' items. Sets buf[0] to zero if the there
494 is no match in `tre_macros'. */
496 tre_expand_macro(const char *regex)
503 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
504 return tre_macros[i].expansion;
508 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
509 tre_ast_node_t ***items)
511 reg_errcode_t status;
512 tre_ast_node_t **array = *items;
513 /* Allocate more space if necessary. */
516 tre_ast_node_t **new_items;
517 /* If the array is already 1024 items large, give up -- there's
518 probably an error in the regexp (e.g. not a '\0' terminated
519 string and missing ']') */
523 new_items = xrealloc(array, sizeof(*items) * *max_i);
524 if (new_items == NULL)
526 *items = array = new_items;
528 array[*i] = tre_ast_new_literal(mem, min, max, -1);
529 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
536 tre_compare_items(const void *a, const void *b)
538 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
539 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
540 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
541 int a_min = l_a->code_min, b_min = l_b->code_min;
545 else if (a_min > b_min)
551 /* Maximum number of character classes that can occur in a negated bracket
553 #define MAX_NEG_CLASSES 64
555 /* Maximum length of character class names. */
556 #define MAX_CLASS_NAME
559 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
560 tre_ctype_t neg_classes[], int *num_neg_classes,
561 tre_ast_node_t ***items, int *num_items,
564 const char *re = ctx->re;
565 reg_errcode_t status = REG_OK;
566 tre_ctype_t class = (tre_ctype_t)0;
568 int max_i = *items_size;
571 /* Build an array of the items in the bracket expression. */
572 while (status == REG_OK)
579 else if (*re == CHAR_RBRACKET && re > ctx->re)
586 tre_cint_t min = 0, max = 0;
588 int clen = mbtowc(&wc, re, -1);
590 if (clen<0) clen=1, wc=WEOF;
592 class = (tre_ctype_t)0;
593 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
597 clen = mbtowc(&wc, re, -1);
598 if (clen<0) clen=1, wc=WEOF;
601 /* XXX - Should use collation order instead of encoding values
602 in character ranges. */
606 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
607 status = REG_ECOLLATE;
608 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
609 status = REG_ECOLLATE;
610 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
613 const char *endptr = re + 2;
615 while (*endptr && *endptr != CHAR_COLON)
619 len = MIN(endptr - re - 2, 63);
620 strncpy(tmp_str, re + 2, len);
622 class = tre_ctype(tmp_str);
634 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
636 /* Two ranges are not allowed to share and endpoint. */
642 if (status != REG_OK)
646 if (*num_neg_classes >= MAX_NEG_CLASSES)
649 neg_classes[(*num_neg_classes)++] = class;
652 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
653 if (status != REG_OK)
655 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
658 /* Add opposite-case counterpoints if REG_ICASE is present.
659 This is broken if there are more than two "same" characters. */
660 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
662 tre_cint_t cmin, ccurr;
666 if (tre_islower(min))
668 cmin = ccurr = tre_toupper(min++);
669 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
671 ccurr = tre_toupper(min++);
672 status = tre_new_item(ctx->mem, cmin, ccurr,
675 else if (tre_isupper(min))
677 cmin = ccurr = tre_tolower(min++);
678 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
680 ccurr = tre_tolower(min++);
681 status = tre_new_item(ctx->mem, cmin, ccurr,
685 if (status != REG_OK)
688 if (status != REG_OK)
700 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
702 tre_ast_node_t *node = NULL;
704 reg_errcode_t status = REG_OK;
705 tre_ast_node_t **items, *u, *n;
706 int i = 0, j, max_i = 32, curr_max, curr_min;
707 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
708 int num_neg_classes = 0;
710 /* Start off with an array of `max_i' elements. */
711 items = xmalloc(sizeof(*items) * max_i);
715 if (*ctx->re == CHAR_CARET)
721 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
724 if (status != REG_OK)
725 goto parse_bracket_done;
727 /* Sort the array if we need to negate it. */
729 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
731 curr_max = curr_min = 0;
732 /* Build a union of the items in the array, negated if necessary. */
733 for (j = 0; j < i && status == REG_OK; j++)
736 tre_literal_t *l = items[j]->obj;
745 curr_max = MAX(max + 1, curr_max);
752 if (curr_max >= curr_min)
754 l->code_min = curr_min;
755 l->code_max = curr_max;
761 curr_min = curr_max = max + 1;
768 l->position = ctx->position;
769 if (num_neg_classes > 0)
771 l->neg_classes = tre_mem_alloc(ctx->mem,
772 (sizeof(l->neg_classes)
773 * (num_neg_classes + 1)));
774 if (l->neg_classes == NULL)
779 for (k = 0; k < num_neg_classes; k++)
780 l->neg_classes[k] = neg_classes[k];
781 l->neg_classes[k] = (tre_ctype_t)0;
784 l->neg_classes = NULL;
789 u = tre_ast_new_union(ctx->mem, node, items[j]);
797 if (status != REG_OK)
798 goto parse_bracket_done;
803 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
808 tre_literal_t *l = n->obj;
809 if (num_neg_classes > 0)
811 l->neg_classes = tre_mem_alloc(ctx->mem,
812 (sizeof(l->neg_classes)
813 * (num_neg_classes + 1)));
814 if (l->neg_classes == NULL)
817 goto parse_bracket_done;
819 for (k = 0; k < num_neg_classes; k++)
820 l->neg_classes[k] = neg_classes[k];
821 l->neg_classes[k] = (tre_ctype_t)0;
824 l->neg_classes = NULL;
829 u = tre_ast_new_union(ctx->mem, node, n);
837 if (status != REG_OK)
838 goto parse_bracket_done;
842 #endif /* TRE_DEBUG */
852 /* Parses a positive decimal integer. Returns -1 if the string does not
853 contain a valid number. */
855 tre_parse_int(const char **regex)
858 const char *r = *regex;
863 num = num * 10 + *r - '0';
872 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
875 const char *r = ctx->re;
878 /* Parse number (minimum repetition count). */
880 if (*r >= '0' && *r <= '9') {
881 min = tre_parse_int(&r);
884 /* Parse comma and second number (maximum repetition count). */
886 if (*r == CHAR_COMMA)
889 max = tre_parse_int(&r);
892 /* Check that the repeat counts are sane. */
893 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
900 /* Empty contents of {}. */
904 /* Parse the ending '}' or '\}'.*/
905 if (ctx->cflags & REG_EXTENDED)
907 if (*r != CHAR_RBRACE)
913 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
918 /* Create the AST node(s). */
919 if (min == 0 && max == 0)
921 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
927 if (min < 0 && max < 0)
928 /* Only approximate parameters set, no repetitions. */
931 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
943 PARSE_MARK_FOR_SUBMATCH,
947 PARSE_POST_CATENATION,
952 } tre_parse_re_stack_symbol_t;
956 tre_parse(tre_parse_ctx_t *ctx)
958 tre_ast_node_t *result = NULL;
959 tre_parse_re_stack_symbol_t symbol;
960 reg_errcode_t status = REG_OK;
961 tre_stack_t *stack = ctx->stack;
962 int bottom = tre_stack_num_objects(stack);
967 if (!ctx->nofirstsub)
969 STACK_PUSH(stack, int, ctx->submatch_id);
970 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
973 STACK_PUSH(stack, int, PARSE_RE);
974 ctx->re_start = ctx->re;
977 /* The following is basically just a recursive descent parser. I use
978 an explicit stack instead of recursive functions mostly because of
979 two reasons: compatibility with systems which have an overflowable
980 call stack, and efficiency (both in lines of code and speed). */
981 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
983 if (status != REG_OK)
985 symbol = tre_stack_pop_int(stack);
989 /* Parse a full regexp. A regexp is one or more branches,
990 separated by the union operator `|'. */
991 if (ctx->cflags & REG_EXTENDED)
992 STACK_PUSHX(stack, int, PARSE_UNION);
993 STACK_PUSHX(stack, int, PARSE_BRANCH);
997 /* Parse a branch. A branch is one or more pieces, concatenated.
998 A piece is an atom possibly followed by a postfix operator. */
999 STACK_PUSHX(stack, int, PARSE_CATENATION);
1000 STACK_PUSHX(stack, int, PARSE_PIECE);
1004 /* Parse a piece. A piece is an atom possibly followed by one
1005 or more postfix operators. */
1006 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1007 STACK_PUSHX(stack, int, PARSE_ATOM);
1010 case PARSE_CATENATION:
1011 /* If the expression has not ended, parse another piece. */
1017 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1019 if ((ctx->cflags & REG_EXTENDED
1020 && c == CHAR_RPAREN && depth > 0)
1021 || (!(ctx->cflags & REG_EXTENDED)
1022 && (c == CHAR_BACKSLASH
1023 && *(ctx->re + 1) == CHAR_RPAREN)))
1025 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1026 status = REG_EPAREN;
1028 if (!(ctx->cflags & REG_EXTENDED))
1034 /* Default case, left associative concatenation. */
1035 STACK_PUSHX(stack, int, PARSE_CATENATION);
1036 STACK_PUSHX(stack, voidptr, result);
1037 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1038 STACK_PUSHX(stack, int, PARSE_PIECE);
1043 case PARSE_POST_CATENATION:
1045 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1046 tre_ast_node_t *tmp_node;
1047 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1058 STACK_PUSHX(stack, int, PARSE_UNION);
1059 STACK_PUSHX(stack, voidptr, result);
1060 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1061 STACK_PUSHX(stack, int, PARSE_BRANCH);
1074 case PARSE_POST_UNION:
1076 tre_ast_node_t *tmp_node;
1077 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1078 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1086 /* Parse postfix operators. */
1090 case CHAR_QUESTIONMARK:
1091 if (!(ctx->cflags & REG_EXTENDED))
1096 tre_ast_node_t *tmp_node;
1101 if (*ctx->re == CHAR_PLUS)
1103 if (*ctx->re == CHAR_QUESTIONMARK)
1107 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1109 if (tmp_node == NULL)
1112 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1116 case CHAR_BACKSLASH:
1117 /* "\{" is special without REG_EXTENDED */
1118 if (!(ctx->cflags & REG_EXTENDED)
1119 && *(ctx->re + 1) == CHAR_LBRACE)
1128 /* "{" is literal without REG_EXTENDED */
1129 if (!(ctx->cflags & REG_EXTENDED))
1135 status = tre_parse_bound(ctx, &result);
1136 if (status != REG_OK)
1138 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1144 /* Parse an atom. An atom is a regular expression enclosed in `()',
1145 an empty set of `()', a bracket expression, `.', `^', `$',
1146 a `\' followed by a character, or a single character. */
1150 case CHAR_LPAREN: /* parenthesized subexpression */
1152 if (ctx->cflags & REG_EXTENDED)
1158 /* First parse a whole RE, then mark the resulting tree
1160 STACK_PUSHX(stack, int, ctx->submatch_id);
1161 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1162 STACK_PUSHX(stack, int, PARSE_RE);
1170 case CHAR_LBRACKET: /* bracket expression */
1172 status = tre_parse_bracket(ctx, &result);
1173 if (status != REG_OK)
1177 case CHAR_BACKSLASH:
1178 /* If this is "\(" or "\)" chew off the backslash and
1180 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1185 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
1190 /* If a macro is used, parse the expanded macro recursively. */
1192 const char *buf = tre_expand_macro(ctx->re + 1);
1195 tre_parse_ctx_t subctx;
1196 memcpy(&subctx, ctx, sizeof(subctx));
1198 subctx.nofirstsub = 1;
1199 status = tre_parse(&subctx);
1200 if (status != REG_OK)
1203 ctx->position = subctx.position;
1204 result = subctx.result;
1210 /* Trailing backslash. */
1217 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1222 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1223 ASSERT_AT_WB_NEG, -1);
1227 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1232 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1238 if (ctx->re[0] != CHAR_LBRACE)
1240 /* 8 bit hex char. */
1241 char tmp[3] = {0, 0, 0};
1244 if (tre_isxdigit(ctx->re[0]))
1246 tmp[0] = (char)ctx->re[0];
1249 if (tre_isxdigit(ctx->re[0]))
1251 tmp[1] = (char)ctx->re[0];
1254 val = strtol(tmp, NULL, 16);
1255 result = tre_ast_new_literal(ctx->mem, (int)val,
1256 (int)val, ctx->position);
1267 while (*ctx->re && i < sizeof tmp)
1269 if (ctx->re[0] == CHAR_RBRACE)
1271 if (tre_isxdigit(ctx->re[0]))
1273 tmp[i] = (char)ctx->re[0];
1282 val = strtol(tmp, NULL, 16);
1283 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1291 if (tre_isdigit(*ctx->re))
1293 /* Back reference. */
1294 int val = *ctx->re - '0';
1295 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1300 ctx->max_backref = MAX(val, ctx->max_backref);
1305 /* Escaped character. */
1306 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1317 case CHAR_PERIOD: /* the any-symbol */
1318 if (ctx->cflags & REG_NEWLINE)
1320 tre_ast_node_t *tmp1;
1321 tre_ast_node_t *tmp2;
1322 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1326 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1330 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1337 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1346 case CHAR_CARET: /* beginning of line assertion */
1347 /* '^' has a special meaning everywhere in EREs, and at
1348 beginning of BRE. */
1349 if (ctx->cflags & REG_EXTENDED
1350 || ctx->re == ctx->re_start)
1352 if (!(ctx->cflags & REG_EXTENDED))
1353 STACK_PUSHX(stack, int, PARSE_CATENATION);
1354 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1364 case CHAR_DOLLAR: /* end of line assertion. */
1365 /* '$' is special everywhere in EREs, and in the end of the
1367 if (ctx->cflags & REG_EXTENDED
1370 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1387 case CHAR_QUESTIONMARK:
1388 if (!(ctx->cflags & REG_EXTENDED))
1393 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1401 clen = mbtowc(&wc, ctx->re, -1);
1402 if (clen<0) clen=1, wc=WEOF;
1404 /* Note that we can't use an tre_isalpha() test here, since there
1405 may be characters which are alphabetic but neither upper or
1407 if (ctx->cflags & REG_ICASE
1408 && (tre_isupper(wc) || tre_islower(wc)))
1410 tre_ast_node_t *tmp1;
1411 tre_ast_node_t *tmp2;
1413 /* XXX - Can there be more than one opposite-case
1414 counterpoints for some character in some locale? Or
1415 more than two characters which all should be regarded
1416 the same character if case is ignored? If yes, there
1417 does not seem to be a portable way to detect it. I guess
1418 that at least for multi-character collating elements there
1419 could be several opposite-case counterpoints, but they
1420 cannot be supported portably anyway. */
1421 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1426 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1431 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1437 result = tre_ast_new_literal(ctx->mem, wc, wc,
1448 case PARSE_MARK_FOR_SUBMATCH:
1450 int submatch_id = tre_stack_pop_int(stack);
1452 if (result->submatch_id >= 0)
1454 tre_ast_node_t *n, *tmp_node;
1455 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1458 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1459 if (tmp_node == NULL)
1461 tmp_node->num_submatches = result->num_submatches;
1464 result->submatch_id = submatch_id;
1465 result->num_submatches++;
1469 case PARSE_RESTORE_CFLAGS:
1470 ctx->cflags = tre_stack_pop_int(stack);
1479 /* Check for missing closing parentheses. */
1483 if (status == REG_OK)
1484 ctx->result = result;
1491 /***********************************************************************
1493 ***********************************************************************/
1498 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1503 Algorithms to setup tags so that submatch addressing can be done.
1507 /* Inserts a catenation node to the root of the tree given in `node'.
1508 As the left child a new tag with number `tag_id' to `node' is added,
1509 and the right child is the old root. */
1510 static reg_errcode_t
1511 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1513 tre_catenation_t *c;
1515 c = tre_mem_alloc(mem, sizeof(*c));
1518 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1519 if (c->left == NULL)
1521 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1522 if (c->right == NULL)
1525 c->right->obj = node->obj;
1526 c->right->type = node->type;
1527 c->right->nullable = -1;
1528 c->right->submatch_id = -1;
1529 c->right->firstpos = NULL;
1530 c->right->lastpos = NULL;
1531 c->right->num_tags = 0;
1533 node->type = CATENATION;
1537 /* Inserts a catenation node to the root of the tree given in `node'.
1538 As the right child a new tag with number `tag_id' to `node' is added,
1539 and the left child is the old root. */
1540 static reg_errcode_t
1541 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1543 tre_catenation_t *c;
1545 c = tre_mem_alloc(mem, sizeof(*c));
1548 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1549 if (c->right == NULL)
1551 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1552 if (c->left == NULL)
1555 c->left->obj = node->obj;
1556 c->left->type = node->type;
1557 c->left->nullable = -1;
1558 c->left->submatch_id = -1;
1559 c->left->firstpos = NULL;
1560 c->left->lastpos = NULL;
1561 c->left->num_tags = 0;
1563 node->type = CATENATION;
1569 ADDTAGS_AFTER_ITERATION,
1570 ADDTAGS_AFTER_UNION_LEFT,
1571 ADDTAGS_AFTER_UNION_RIGHT,
1572 ADDTAGS_AFTER_CAT_LEFT,
1573 ADDTAGS_AFTER_CAT_RIGHT,
1574 ADDTAGS_SET_SUBMATCH_END
1575 } tre_addtags_symbol_t;
1584 /* Go through `regset' and set submatch data for submatches that are
1587 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1591 for (i = 0; regset[i] >= 0; i++)
1593 int id = regset[i] / 2;
1594 int start = !(regset[i] % 2);
1596 tnfa->submatch_data[id].so_tag = tag;
1598 tnfa->submatch_data[id].eo_tag = tag;
1604 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1605 subexpressions marked for submatch addressing can be traced. */
1606 static reg_errcode_t
1607 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1610 reg_errcode_t status = REG_OK;
1611 tre_addtags_symbol_t symbol;
1612 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1613 int bottom = tre_stack_num_objects(stack);
1614 /* True for first pass (counting number of needed tags) */
1615 int first_pass = (mem == NULL || tnfa == NULL);
1616 int *regset, *orig_regset;
1617 int num_tags = 0; /* Total number of tags. */
1618 int num_minimals = 0; /* Number of special minimal tags. */
1619 int tag = 0; /* The tag that is to be added next. */
1620 int next_tag = 1; /* Next tag to use after this one. */
1621 int *parents; /* Stack of submatches the current submatch is
1623 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1624 tre_tag_states_t *saved_states;
1626 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1630 tnfa->minimal_tags[0] = -1;
1633 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1637 orig_regset = regset;
1639 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1640 if (parents == NULL)
1647 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1648 if (saved_states == NULL)
1657 for (i = 0; i <= tnfa->num_submatches; i++)
1658 saved_states[i].tag = -1;
1661 STACK_PUSH(stack, voidptr, node);
1662 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1664 while (tre_stack_num_objects(stack) > bottom)
1666 if (status != REG_OK)
1669 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1673 case ADDTAGS_SET_SUBMATCH_END:
1675 int id = tre_stack_pop_int(stack);
1678 /* Add end of this submatch to regset. */
1679 for (i = 0; regset[i] >= 0; i++);
1680 regset[i] = id * 2 + 1;
1683 /* Pop this submatch from the parents stack. */
1684 for (i = 0; parents[i] >= 0; i++);
1685 parents[i - 1] = -1;
1689 case ADDTAGS_RECURSE:
1690 node = tre_stack_pop_voidptr(stack);
1692 if (node->submatch_id >= 0)
1694 int id = node->submatch_id;
1698 /* Add start of this submatch to regset. */
1699 for (i = 0; regset[i] >= 0; i++);
1705 for (i = 0; parents[i] >= 0; i++);
1706 tnfa->submatch_data[id].parents = NULL;
1709 int *p = xmalloc(sizeof(*p) * (i + 1));
1712 status = REG_ESPACE;
1715 assert(tnfa->submatch_data[id].parents == NULL);
1716 tnfa->submatch_data[id].parents = p;
1717 for (i = 0; parents[i] >= 0; i++)
1723 /* Add end of this submatch to regset after processing this
1725 STACK_PUSHX(stack, int, node->submatch_id);
1726 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1733 tre_literal_t *lit = node->obj;
1735 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1740 /* Regset is not empty, so add a tag before the
1741 literal or backref. */
1744 status = tre_add_tag_left(mem, node, tag);
1745 tnfa->tag_directions[tag] = direction;
1746 if (minimal_tag >= 0)
1748 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1749 tnfa->minimal_tags[i] = tag;
1750 tnfa->minimal_tags[i + 1] = minimal_tag;
1751 tnfa->minimal_tags[i + 2] = -1;
1755 tre_purge_regset(regset, tnfa, tag);
1770 assert(!IS_TAG(lit));
1776 tre_catenation_t *cat = node->obj;
1777 tre_ast_node_t *left = cat->left;
1778 tre_ast_node_t *right = cat->right;
1779 int reserved_tag = -1;
1782 /* After processing right child. */
1783 STACK_PUSHX(stack, voidptr, node);
1784 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1786 /* Process right child. */
1787 STACK_PUSHX(stack, voidptr, right);
1788 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1790 /* After processing left child. */
1791 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1792 if (left->num_tags > 0 && right->num_tags > 0)
1794 /* Reserve the next tag to the right child. */
1795 reserved_tag = next_tag;
1798 STACK_PUSHX(stack, int, reserved_tag);
1799 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1801 /* Process left child. */
1802 STACK_PUSHX(stack, voidptr, left);
1803 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1809 tre_iteration_t *iter = node->obj;
1813 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1817 STACK_PUSHX(stack, int, tag);
1818 STACK_PUSHX(stack, int, iter->minimal);
1820 STACK_PUSHX(stack, voidptr, node);
1821 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1823 STACK_PUSHX(stack, voidptr, iter->arg);
1824 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1826 /* Regset is not empty, so add a tag here. */
1827 if (regset[0] >= 0 || iter->minimal)
1832 status = tre_add_tag_left(mem, node, tag);
1834 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1836 tnfa->tag_directions[tag] = direction;
1837 if (minimal_tag >= 0)
1839 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1840 tnfa->minimal_tags[i] = tag;
1841 tnfa->minimal_tags[i + 1] = minimal_tag;
1842 tnfa->minimal_tags[i + 2] = -1;
1846 tre_purge_regset(regset, tnfa, tag);
1854 direction = TRE_TAG_MINIMIZE;
1859 tre_union_t *uni = node->obj;
1860 tre_ast_node_t *left = uni->left;
1861 tre_ast_node_t *right = uni->right;
1867 left_tag = next_tag;
1868 right_tag = next_tag + 1;
1873 right_tag = next_tag;
1876 /* After processing right child. */
1877 STACK_PUSHX(stack, int, right_tag);
1878 STACK_PUSHX(stack, int, left_tag);
1879 STACK_PUSHX(stack, voidptr, regset);
1880 STACK_PUSHX(stack, int, regset[0] >= 0);
1881 STACK_PUSHX(stack, voidptr, node);
1882 STACK_PUSHX(stack, voidptr, right);
1883 STACK_PUSHX(stack, voidptr, left);
1884 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1886 /* Process right child. */
1887 STACK_PUSHX(stack, voidptr, right);
1888 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1890 /* After processing left child. */
1891 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1893 /* Process left child. */
1894 STACK_PUSHX(stack, voidptr, left);
1895 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1897 /* Regset is not empty, so add a tag here. */
1903 status = tre_add_tag_left(mem, node, tag);
1904 tnfa->tag_directions[tag] = direction;
1905 if (minimal_tag >= 0)
1907 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1908 tnfa->minimal_tags[i] = tag;
1909 tnfa->minimal_tags[i + 1] = minimal_tag;
1910 tnfa->minimal_tags[i + 2] = -1;
1914 tre_purge_regset(regset, tnfa, tag);
1923 if (node->num_submatches > 0)
1925 /* The next two tags are reserved for markers. */
1935 if (node->submatch_id >= 0)
1938 /* Push this submatch on the parents stack. */
1939 for (i = 0; parents[i] >= 0; i++);
1940 parents[i] = node->submatch_id;
1941 parents[i + 1] = -1;
1944 break; /* end case: ADDTAGS_RECURSE */
1946 case ADDTAGS_AFTER_ITERATION:
1950 node = tre_stack_pop_voidptr(stack);
1953 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1954 + tre_stack_pop_int(stack);
1959 minimal = tre_stack_pop_int(stack);
1960 enter_tag = tre_stack_pop_int(stack);
1962 minimal_tag = enter_tag;
1968 direction = TRE_TAG_MINIMIZE;
1970 direction = TRE_TAG_MAXIMIZE;
1975 case ADDTAGS_AFTER_CAT_LEFT:
1977 int new_tag = tre_stack_pop_int(stack);
1978 next_tag = tre_stack_pop_int(stack);
1986 case ADDTAGS_AFTER_CAT_RIGHT:
1987 node = tre_stack_pop_voidptr(stack);
1989 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1990 + ((tre_catenation_t *)node->obj)->right->num_tags;
1993 case ADDTAGS_AFTER_UNION_LEFT:
1994 /* Lift the bottom of the `regset' array so that when processing
1995 the right operand the items currently in the array are
1996 invisible. The original bottom was saved at ADDTAGS_UNION and
1997 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1998 while (*regset >= 0)
2002 case ADDTAGS_AFTER_UNION_RIGHT:
2004 int added_tags, tag_left, tag_right;
2005 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2006 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2007 node = tre_stack_pop_voidptr(stack);
2008 added_tags = tre_stack_pop_int(stack);
2011 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2012 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2013 + ((node->num_submatches > 0) ? 2 : 0);
2015 regset = tre_stack_pop_voidptr(stack);
2016 tag_left = tre_stack_pop_int(stack);
2017 tag_right = tre_stack_pop_int(stack);
2019 /* Add tags after both children, the left child gets a smaller
2020 tag than the right child. This guarantees that we prefer
2021 the left child over the right child. */
2022 /* XXX - This is not always necessary (if the children have
2023 tags which must be seen for every match of that child). */
2024 /* XXX - Check if this is the only place where tre_add_tag_right
2025 is used. If so, use tre_add_tag_left (putting the tag before
2026 the child as opposed after the child) and throw away
2027 tre_add_tag_right. */
2028 if (node->num_submatches > 0)
2032 status = tre_add_tag_right(mem, left, tag_left);
2033 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2034 status = tre_add_tag_right(mem, right, tag_right);
2035 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2039 direction = TRE_TAG_MAXIMIZE;
2047 } /* end switch(symbol) */
2048 } /* end while(tre_stack_num_objects(stack) > bottom) */
2051 tre_purge_regset(regset, tnfa, tag);
2053 if (!first_pass && minimal_tag >= 0)
2056 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2057 tnfa->minimal_tags[i] = tag;
2058 tnfa->minimal_tags[i + 1] = minimal_tag;
2059 tnfa->minimal_tags[i + 2] = -1;
2064 assert(tree->num_tags == num_tags);
2065 tnfa->end_tag = num_tags;
2066 tnfa->num_tags = num_tags;
2067 tnfa->num_minimals = num_minimals;
2070 xfree(saved_states);
2077 AST to TNFA compilation routines.
2083 } tre_copyast_symbol_t;
2085 /* Flags for tre_copy_ast(). */
2086 #define COPY_REMOVE_TAGS 1
2087 #define COPY_MAXIMIZE_FIRST_TAG 2
2089 static reg_errcode_t
2090 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2091 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2092 tre_ast_node_t **copy, int *max_pos)
2094 reg_errcode_t status = REG_OK;
2095 int bottom = tre_stack_num_objects(stack);
2098 tre_ast_node_t **result = copy;
2099 tre_copyast_symbol_t symbol;
2101 STACK_PUSH(stack, voidptr, ast);
2102 STACK_PUSH(stack, int, COPY_RECURSE);
2104 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2106 tre_ast_node_t *node;
2107 if (status != REG_OK)
2110 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2113 case COPY_SET_RESULT_PTR:
2114 result = tre_stack_pop_voidptr(stack);
2117 node = tre_stack_pop_voidptr(stack);
2122 tre_literal_t *lit = node->obj;
2123 int pos = lit->position;
2124 int min = lit->code_min;
2125 int max = lit->code_max;
2126 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2128 /* XXX - e.g. [ab] has only one position but two
2129 nodes, so we are creating holes in the state space
2130 here. Not fatal, just wastes memory. */
2134 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2136 /* Change this tag to empty. */
2140 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2143 /* Maximize the first tag. */
2144 tag_directions[max] = TRE_TAG_MAXIMIZE;
2147 *result = tre_ast_new_literal(mem, min, max, pos);
2148 if (*result == NULL)
2149 status = REG_ESPACE;
2157 tre_union_t *uni = node->obj;
2159 *result = tre_ast_new_union(mem, uni->left, uni->right);
2160 if (*result == NULL)
2162 status = REG_ESPACE;
2165 tmp = (*result)->obj;
2166 result = &tmp->left;
2167 STACK_PUSHX(stack, voidptr, uni->right);
2168 STACK_PUSHX(stack, int, COPY_RECURSE);
2169 STACK_PUSHX(stack, voidptr, &tmp->right);
2170 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2171 STACK_PUSHX(stack, voidptr, uni->left);
2172 STACK_PUSHX(stack, int, COPY_RECURSE);
2177 tre_catenation_t *cat = node->obj;
2178 tre_catenation_t *tmp;
2179 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2180 if (*result == NULL)
2182 status = REG_ESPACE;
2185 tmp = (*result)->obj;
2188 result = &tmp->left;
2190 STACK_PUSHX(stack, voidptr, cat->right);
2191 STACK_PUSHX(stack, int, COPY_RECURSE);
2192 STACK_PUSHX(stack, voidptr, &tmp->right);
2193 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2194 STACK_PUSHX(stack, voidptr, cat->left);
2195 STACK_PUSHX(stack, int, COPY_RECURSE);
2200 tre_iteration_t *iter = node->obj;
2201 STACK_PUSHX(stack, voidptr, iter->arg);
2202 STACK_PUSHX(stack, int, COPY_RECURSE);
2203 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2204 iter->max, iter->minimal);
2205 if (*result == NULL)
2207 status = REG_ESPACE;
2210 iter = (*result)->obj;
2211 result = &iter->arg;
2221 *pos_add += num_copied;
2228 } tre_expand_ast_symbol_t;
2230 /* Expands each iteration node that has a finite nonzero minimum or maximum
2231 iteration count to a catenated sequence of copies of the node. */
2232 static reg_errcode_t
2233 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2234 int *position, tre_tag_direction_t *tag_directions,
2237 reg_errcode_t status = REG_OK;
2238 int bottom = tre_stack_num_objects(stack);
2240 int pos_add_total = 0;
2244 STACK_PUSHR(stack, voidptr, ast);
2245 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2246 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2248 tre_ast_node_t *node;
2249 tre_expand_ast_symbol_t symbol;
2251 if (status != REG_OK)
2254 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2255 node = tre_stack_pop_voidptr(stack);
2258 case EXPAND_RECURSE:
2263 tre_literal_t *lit= node->obj;
2264 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2266 lit->position += pos_add;
2267 if (lit->position > max_pos)
2268 max_pos = lit->position;
2274 tre_union_t *uni = node->obj;
2275 STACK_PUSHX(stack, voidptr, uni->right);
2276 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2277 STACK_PUSHX(stack, voidptr, uni->left);
2278 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2283 tre_catenation_t *cat = node->obj;
2284 STACK_PUSHX(stack, voidptr, cat->right);
2285 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2286 STACK_PUSHX(stack, voidptr, cat->left);
2287 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2292 tre_iteration_t *iter = node->obj;
2293 STACK_PUSHX(stack, int, pos_add);
2294 STACK_PUSHX(stack, voidptr, node);
2295 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2296 STACK_PUSHX(stack, voidptr, iter->arg);
2297 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2298 /* If we are going to expand this node at EXPAND_AFTER_ITER
2299 then don't increase the `pos' fields of the nodes now, it
2300 will get done when expanding. */
2301 if (iter->min > 1 || iter->max > 1)
2311 case EXPAND_AFTER_ITER:
2313 tre_iteration_t *iter = node->obj;
2315 pos_add = tre_stack_pop_int(stack);
2316 pos_add_last = pos_add;
2317 if (iter->min > 1 || iter->max > 1)
2319 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2321 int pos_add_save = pos_add;
2323 /* Create a catenated sequence of copies of the node. */
2324 for (j = 0; j < iter->min; j++)
2326 tre_ast_node_t *copy;
2327 /* Remove tags from all but the last copy. */
2328 int flags = ((j + 1 < iter->min)
2330 : COPY_MAXIMIZE_FIRST_TAG);
2331 pos_add_save = pos_add;
2332 status = tre_copy_ast(mem, stack, iter->arg, flags,
2333 &pos_add, tag_directions, ©,
2335 if (status != REG_OK)
2338 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2345 if (iter->max == -1)
2347 /* No upper limit. */
2348 pos_add_save = pos_add;
2349 status = tre_copy_ast(mem, stack, iter->arg, 0,
2350 &pos_add, NULL, &seq2, &max_pos);
2351 if (status != REG_OK)
2353 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2359 for (j = iter->min; j < iter->max; j++)
2361 tre_ast_node_t *tmp, *copy;
2362 pos_add_save = pos_add;
2363 status = tre_copy_ast(mem, stack, iter->arg, 0,
2364 &pos_add, NULL, ©, &max_pos);
2365 if (status != REG_OK)
2368 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2373 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2376 seq2 = tre_ast_new_union(mem, tmp, seq2);
2382 pos_add = pos_add_save;
2385 else if (seq2 != NULL)
2386 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2389 node->obj = seq1->obj;
2390 node->type = seq1->type;
2394 pos_add_total += pos_add - pos_add_last;
2395 if (iter_depth == 0)
2396 pos_add = pos_add_total;
2406 *position += pos_add_total;
2408 /* `max_pos' should never be larger than `*position' if the above
2409 code works, but just an extra safeguard let's make sure
2410 `*position' is set large enough so enough memory will be
2411 allocated for the transition table. */
2412 if (max_pos > *position)
2413 *position = max_pos;
2418 static tre_pos_and_tags_t *
2419 tre_set_empty(tre_mem_t mem)
2421 tre_pos_and_tags_t *new_set;
2423 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2424 if (new_set == NULL)
2427 new_set[0].position = -1;
2428 new_set[0].code_min = -1;
2429 new_set[0].code_max = -1;
2434 static tre_pos_and_tags_t *
2435 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2436 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2438 tre_pos_and_tags_t *new_set;
2440 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2441 if (new_set == NULL)
2444 new_set[0].position = position;
2445 new_set[0].code_min = code_min;
2446 new_set[0].code_max = code_max;
2447 new_set[0].class = class;
2448 new_set[0].neg_classes = neg_classes;
2449 new_set[0].backref = backref;
2450 new_set[1].position = -1;
2451 new_set[1].code_min = -1;
2452 new_set[1].code_max = -1;
2457 static tre_pos_and_tags_t *
2458 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2459 int *tags, int assertions)
2462 tre_pos_and_tags_t *new_set;
2466 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2467 for (s1 = 0; set1[s1].position >= 0; s1++);
2468 for (s2 = 0; set2[s2].position >= 0; s2++);
2469 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2473 for (s1 = 0; set1[s1].position >= 0; s1++)
2475 new_set[s1].position = set1[s1].position;
2476 new_set[s1].code_min = set1[s1].code_min;
2477 new_set[s1].code_max = set1[s1].code_max;
2478 new_set[s1].assertions = set1[s1].assertions | assertions;
2479 new_set[s1].class = set1[s1].class;
2480 new_set[s1].neg_classes = set1[s1].neg_classes;
2481 new_set[s1].backref = set1[s1].backref;
2482 if (set1[s1].tags == NULL && tags == NULL)
2483 new_set[s1].tags = NULL;
2486 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2487 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2488 * (i + num_tags + 1)));
2489 if (new_tags == NULL)
2491 for (j = 0; j < i; j++)
2492 new_tags[j] = set1[s1].tags[j];
2493 for (i = 0; i < num_tags; i++)
2494 new_tags[j + i] = tags[i];
2495 new_tags[j + i] = -1;
2496 new_set[s1].tags = new_tags;
2500 for (s2 = 0; set2[s2].position >= 0; s2++)
2502 new_set[s1 + s2].position = set2[s2].position;
2503 new_set[s1 + s2].code_min = set2[s2].code_min;
2504 new_set[s1 + s2].code_max = set2[s2].code_max;
2505 /* XXX - why not | assertions here as well? */
2506 new_set[s1 + s2].assertions = set2[s2].assertions;
2507 new_set[s1 + s2].class = set2[s2].class;
2508 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2509 new_set[s1 + s2].backref = set2[s2].backref;
2510 if (set2[s2].tags == NULL)
2511 new_set[s1 + s2].tags = NULL;
2514 for (i = 0; set2[s2].tags[i] >= 0; i++);
2515 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2516 if (new_tags == NULL)
2518 for (j = 0; j < i; j++)
2519 new_tags[j] = set2[s2].tags[j];
2521 new_set[s1 + s2].tags = new_tags;
2524 new_set[s1 + s2].position = -1;
2528 /* Finds the empty path through `node' which is the one that should be
2529 taken according to POSIX.2 rules, and adds the tags on that path to
2530 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2531 set to the number of tags seen on the path. */
2532 static reg_errcode_t
2533 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2534 int *assertions, int *params, int *num_tags_seen,
2539 tre_catenation_t *cat;
2540 tre_iteration_t *iter;
2542 int bottom = tre_stack_num_objects(stack);
2543 reg_errcode_t status = REG_OK;
2547 status = tre_stack_push_voidptr(stack, node);
2549 /* Walk through the tree recursively. */
2550 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2552 node = tre_stack_pop_voidptr(stack);
2557 lit = (tre_literal_t *)node->obj;
2558 switch (lit->code_min)
2561 if (lit->code_max >= 0)
2565 /* Add the tag to `tags'. */
2566 for (i = 0; tags[i] >= 0; i++)
2567 if (tags[i] == lit->code_max)
2571 tags[i] = lit->code_max;
2580 assert(lit->code_max >= 1
2581 || lit->code_max <= ASSERT_LAST);
2582 if (assertions != NULL)
2583 *assertions |= lit->code_max;
2594 /* Subexpressions starting earlier take priority over ones
2595 starting later, so we prefer the left subexpression over the
2596 right subexpression. */
2597 uni = (tre_union_t *)node->obj;
2598 if (uni->left->nullable)
2599 STACK_PUSHX(stack, voidptr, uni->left)
2600 else if (uni->right->nullable)
2601 STACK_PUSHX(stack, voidptr, uni->right)
2607 /* The path must go through both children. */
2608 cat = (tre_catenation_t *)node->obj;
2609 assert(cat->left->nullable);
2610 assert(cat->right->nullable);
2611 STACK_PUSHX(stack, voidptr, cat->left);
2612 STACK_PUSHX(stack, voidptr, cat->right);
2616 /* A match with an empty string is preferred over no match at
2617 all, so we go through the argument if possible. */
2618 iter = (tre_iteration_t *)node->obj;
2619 if (iter->arg->nullable)
2620 STACK_PUSHX(stack, voidptr, iter->arg);
2636 NFL_POST_CATENATION,
2638 } tre_nfl_stack_symbol_t;
2641 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2642 the nodes of the AST `tree'. */
2643 static reg_errcode_t
2644 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2646 int bottom = tre_stack_num_objects(stack);
2648 STACK_PUSHR(stack, voidptr, tree);
2649 STACK_PUSHR(stack, int, NFL_RECURSE);
2651 while (tre_stack_num_objects(stack) > bottom)
2653 tre_nfl_stack_symbol_t symbol;
2654 tre_ast_node_t *node;
2656 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2657 node = tre_stack_pop_voidptr(stack);
2665 tre_literal_t *lit = (tre_literal_t *)node->obj;
2666 if (IS_BACKREF(lit))
2668 /* Back references: nullable = false, firstpos = {i},
2671 node->firstpos = tre_set_one(mem, lit->position, 0,
2672 TRE_CHAR_MAX, 0, NULL, -1);
2673 if (!node->firstpos)
2675 node->lastpos = tre_set_one(mem, lit->position, 0,
2676 TRE_CHAR_MAX, 0, NULL,
2677 (int)lit->code_max);
2681 else if (lit->code_min < 0)
2683 /* Tags, empty strings, params, and zero width assertions:
2684 nullable = true, firstpos = {}, and lastpos = {}. */
2686 node->firstpos = tre_set_empty(mem);
2687 if (!node->firstpos)
2689 node->lastpos = tre_set_empty(mem);
2695 /* Literal at position i: nullable = false, firstpos = {i},
2699 tre_set_one(mem, lit->position, (int)lit->code_min,
2700 (int)lit->code_max, 0, NULL, -1);
2701 if (!node->firstpos)
2703 node->lastpos = tre_set_one(mem, lit->position,
2706 lit->u.class, lit->neg_classes,
2715 /* Compute the attributes for the two subtrees, and after that
2717 STACK_PUSHR(stack, voidptr, node);
2718 STACK_PUSHR(stack, int, NFL_POST_UNION);
2719 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2720 STACK_PUSHR(stack, int, NFL_RECURSE);
2721 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2722 STACK_PUSHR(stack, int, NFL_RECURSE);
2726 /* Compute the attributes for the two subtrees, and after that
2728 STACK_PUSHR(stack, voidptr, node);
2729 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2730 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2731 STACK_PUSHR(stack, int, NFL_RECURSE);
2732 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2733 STACK_PUSHR(stack, int, NFL_RECURSE);
2737 /* Compute the attributes for the subtree, and after that for
2739 STACK_PUSHR(stack, voidptr, node);
2740 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2741 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2742 STACK_PUSHR(stack, int, NFL_RECURSE);
2745 break; /* end case: NFL_RECURSE */
2747 case NFL_POST_UNION:
2749 tre_union_t *uni = (tre_union_t *)node->obj;
2750 node->nullable = uni->left->nullable || uni->right->nullable;
2751 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2752 uni->right->firstpos, NULL, 0);
2753 if (!node->firstpos)
2755 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2756 uni->right->lastpos, NULL, 0);
2762 case NFL_POST_ITERATION:
2764 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2766 if (iter->min == 0 || iter->arg->nullable)
2770 node->firstpos = iter->arg->firstpos;
2771 node->lastpos = iter->arg->lastpos;
2775 case NFL_POST_CATENATION:
2777 int num_tags, *tags, assertions, params_seen;
2779 reg_errcode_t status;
2780 tre_catenation_t *cat = node->obj;
2781 node->nullable = cat->left->nullable && cat->right->nullable;
2783 /* Compute firstpos. */
2784 if (cat->left->nullable)
2786 /* The left side matches the empty string. Make a first pass
2787 with tre_match_empty() to get the number of tags and
2789 status = tre_match_empty(stack, cat->left,
2790 NULL, NULL, NULL, &num_tags,
2792 if (status != REG_OK)
2794 /* Allocate arrays for the tags and parameters. */
2795 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2800 /* Second pass with tre_mach_empty() to get the list of
2801 tags and parameters. */
2802 status = tre_match_empty(stack, cat->left, tags,
2803 &assertions, params, NULL, NULL);
2804 if (status != REG_OK)
2810 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2813 if (!node->firstpos)
2818 node->firstpos = cat->left->firstpos;
2821 /* Compute lastpos. */
2822 if (cat->right->nullable)
2824 /* The right side matches the empty string. Make a first pass
2825 with tre_match_empty() to get the number of tags and
2827 status = tre_match_empty(stack, cat->right,
2828 NULL, NULL, NULL, &num_tags,
2830 if (status != REG_OK)
2832 /* Allocate arrays for the tags and parameters. */
2833 tags = xmalloc(sizeof(int) * (num_tags + 1));
2838 /* Second pass with tre_mach_empty() to get the list of
2839 tags and parameters. */
2840 status = tre_match_empty(stack, cat->right, tags,
2841 &assertions, params, NULL, NULL);
2842 if (status != REG_OK)
2848 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2856 node->lastpos = cat->right->lastpos;
2871 /* Adds a transition from each position in `p1' to each position in `p2'. */
2872 static reg_errcode_t
2873 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2874 tre_tnfa_transition_t *transitions,
2875 int *counts, int *offs)
2877 tre_pos_and_tags_t *orig_p2 = p2;
2878 tre_tnfa_transition_t *trans;
2879 int i, j, k, l, dup, prev_p2_pos;
2881 if (transitions != NULL)
2882 while (p1->position >= 0)
2886 while (p2->position >= 0)
2888 /* Optimization: if this position was already handled, skip it. */
2889 if (p2->position == prev_p2_pos)
2894 prev_p2_pos = p2->position;
2895 /* Set `trans' to point to the next unused transition from
2896 position `p1->position'. */
2897 trans = transitions + offs[p1->position];
2898 while (trans->state != NULL)
2901 /* If we find a previous transition from `p1->position' to
2902 `p2->position', it is overwritten. This can happen only
2903 if there are nested loops in the regexp, like in "((a)*)*".
2904 In POSIX.2 repetition using the outer loop is always
2905 preferred over using the inner loop. Therefore the
2906 transition for the inner loop is useless and can be thrown
2908 /* XXX - The same position is used for all nodes in a bracket
2909 expression, so this optimization cannot be used (it will
2910 break bracket expressions) unless I figure out a way to
2912 if (trans->state_id == p2->position)
2920 if (trans->state == NULL)
2921 (trans + 1)->state = NULL;
2922 /* Use the character ranges, assertions, etc. from `p1' for
2923 the transition from `p1' to `p2'. */
2924 trans->code_min = p1->code_min;
2925 trans->code_max = p1->code_max;
2926 trans->state = transitions + offs[p2->position];
2927 trans->state_id = p2->position;
2928 trans->assertions = p1->assertions | p2->assertions
2929 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2930 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2931 if (p1->backref >= 0)
2933 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2934 assert(p2->backref < 0);
2935 trans->u.backref = p1->backref;
2936 trans->assertions |= ASSERT_BACKREF;
2939 trans->u.class = p1->class;
2940 if (p1->neg_classes != NULL)
2942 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2943 trans->neg_classes =
2944 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2945 if (trans->neg_classes == NULL)
2947 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2948 trans->neg_classes[i] = p1->neg_classes[i];
2949 trans->neg_classes[i] = (tre_ctype_t)0;
2952 trans->neg_classes = NULL;
2954 /* Find out how many tags this transition has. */
2956 if (p1->tags != NULL)
2957 while(p1->tags[i] >= 0)
2960 if (p2->tags != NULL)
2961 while(p2->tags[j] >= 0)
2964 /* If we are overwriting a transition, free the old tag array. */
2965 if (trans->tags != NULL)
2969 /* If there were any tags, allocate an array and fill it. */
2972 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2976 if (p1->tags != NULL)
2977 while(p1->tags[i] >= 0)
2979 trans->tags[i] = p1->tags[i];
2984 if (p2->tags != NULL)
2985 while (p2->tags[j] >= 0)
2987 /* Don't add duplicates. */
2989 for (k = 0; k < i; k++)
2990 if (trans->tags[k] == p2->tags[j])
2996 trans->tags[l++] = p2->tags[j];
2999 trans->tags[l] = -1;
3007 /* Compute a maximum limit for the number of transitions leaving
3009 while (p1->position >= 0)
3012 while (p2->position >= 0)
3014 counts[p1->position]++;
3022 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3023 labelled with one character range (there are no transitions on empty
3024 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3026 static reg_errcode_t
3027 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3028 int *counts, int *offs)
3031 tre_catenation_t *cat;
3032 tre_iteration_t *iter;
3033 reg_errcode_t errcode = REG_OK;
3035 /* XXX - recurse using a stack!. */
3041 uni = (tre_union_t *)node->obj;
3042 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3043 if (errcode != REG_OK)
3045 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3049 cat = (tre_catenation_t *)node->obj;
3050 /* Add a transition from each position in cat->left->lastpos
3051 to each position in cat->right->firstpos. */
3052 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3053 transitions, counts, offs);
3054 if (errcode != REG_OK)
3056 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3057 if (errcode != REG_OK)
3059 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3063 iter = (tre_iteration_t *)node->obj;
3064 assert(iter->max == -1 || iter->max == 1);
3066 if (iter->max == -1)
3068 assert(iter->min == 0 || iter->min == 1);
3069 /* Add a transition from each last position in the iterated
3070 expression to each first position. */
3071 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3072 transitions, counts, offs);
3073 if (errcode != REG_OK)
3076 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3083 #define ERROR_EXIT(err) \
3087 if (/*CONSTCOND*/1) \
3090 while (/*CONSTCOND*/0)
3094 regcomp(regex_t *preg, const char *regex, int cflags)
3097 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3098 tre_pos_and_tags_t *p;
3099 int *counts = NULL, *offs = NULL;
3101 tre_tnfa_transition_t *transitions, *initial;
3102 tre_tnfa_t *tnfa = NULL;
3103 tre_submatch_data_t *submatch_data;
3104 tre_tag_direction_t *tag_directions = NULL;
3105 reg_errcode_t errcode;
3108 /* Parse context. */
3109 tre_parse_ctx_t parse_ctx;
3111 /* Allocate a stack used throughout the compilation process for various
3113 stack = tre_stack_new(512, 10240, 128);
3116 /* Allocate a fast memory allocator. */
3117 mem = tre_mem_new();
3120 tre_stack_destroy(stack);
3124 /* Parse the regexp. */
3125 memset(&parse_ctx, 0, sizeof(parse_ctx));
3126 parse_ctx.mem = mem;
3127 parse_ctx.stack = stack;
3128 parse_ctx.re = regex;
3129 parse_ctx.cflags = cflags;
3130 parse_ctx.max_backref = -1;
3131 errcode = tre_parse(&parse_ctx);
3132 if (errcode != REG_OK)
3133 ERROR_EXIT(errcode);
3134 preg->re_nsub = parse_ctx.submatch_id - 1;
3135 tree = parse_ctx.result;
3137 /* Back references and approximate matching cannot currently be used
3138 in the same regexp. */
3139 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3140 ERROR_EXIT(REG_BADPAT);
3143 tre_ast_print(tree);
3144 #endif /* TRE_DEBUG */
3146 /* Referring to nonexistent subexpressions is illegal. */
3147 if (parse_ctx.max_backref > (int)preg->re_nsub)
3148 ERROR_EXIT(REG_ESUBREG);
3150 /* Allocate the TNFA struct. */
3151 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3153 ERROR_EXIT(REG_ESPACE);
3154 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3155 tnfa->have_approx = parse_ctx.have_approx;
3156 tnfa->num_submatches = parse_ctx.submatch_id;
3158 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3159 regexp does not have back references, this can be skipped. */
3160 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3163 /* Figure out how many tags we will need. */
3164 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3165 if (errcode != REG_OK)
3166 ERROR_EXIT(errcode);
3168 if (tnfa->num_tags > 0)
3170 tag_directions = xmalloc(sizeof(*tag_directions)
3171 * (tnfa->num_tags + 1));
3172 if (tag_directions == NULL)
3173 ERROR_EXIT(REG_ESPACE);
3174 tnfa->tag_directions = tag_directions;
3175 memset(tag_directions, -1,
3176 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3178 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3179 sizeof(tnfa->minimal_tags));
3180 if (tnfa->minimal_tags == NULL)
3181 ERROR_EXIT(REG_ESPACE);
3183 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3184 sizeof(*submatch_data));
3185 if (submatch_data == NULL)
3186 ERROR_EXIT(REG_ESPACE);
3187 tnfa->submatch_data = submatch_data;
3189 errcode = tre_add_tags(mem, stack, tree, tnfa);
3190 if (errcode != REG_OK)
3191 ERROR_EXIT(errcode);
3195 /* Expand iteration nodes. */
3196 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3197 tag_directions, &tnfa->params_depth);
3198 if (errcode != REG_OK)
3199 ERROR_EXIT(errcode);
3201 /* Add a dummy node for the final state.
3202 XXX - For certain patterns this dummy node can be optimized away,
3203 for example "a*" or "ab*". Figure out a simple way to detect
3204 this possibility. */
3206 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3207 if (tmp_ast_r == NULL)
3208 ERROR_EXIT(REG_ESPACE);
3210 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3212 ERROR_EXIT(REG_ESPACE);
3214 errcode = tre_compute_nfl(mem, stack, tree);
3215 if (errcode != REG_OK)
3216 ERROR_EXIT(errcode);
3218 counts = xmalloc(sizeof(int) * parse_ctx.position);
3220 ERROR_EXIT(REG_ESPACE);
3222 offs = xmalloc(sizeof(int) * parse_ctx.position);
3224 ERROR_EXIT(REG_ESPACE);
3226 for (i = 0; i < parse_ctx.position; i++)
3228 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3231 for (i = 0; i < parse_ctx.position; i++)
3234 add += counts[i] + 1;
3237 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3238 if (transitions == NULL)
3239 ERROR_EXIT(REG_ESPACE);
3240 tnfa->transitions = transitions;
3241 tnfa->num_transitions = add;
3243 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3244 if (errcode != REG_OK)
3245 ERROR_EXIT(errcode);
3247 tnfa->firstpos_chars = NULL;
3251 while (p->position >= 0)
3257 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3258 if (initial == NULL)
3259 ERROR_EXIT(REG_ESPACE);
3260 tnfa->initial = initial;
3263 for (p = tree->firstpos; p->position >= 0; p++)
3265 initial[i].state = transitions + offs[p->position];
3266 initial[i].state_id = p->position;
3267 initial[i].tags = NULL;
3268 /* Copy the arrays p->tags, and p->params, they are allocated
3269 from a tre_mem object. */
3273 for (j = 0; p->tags[j] >= 0; j++);
3274 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3275 if (!initial[i].tags)
3276 ERROR_EXIT(REG_ESPACE);
3277 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3279 initial[i].assertions = p->assertions;
3282 initial[i].state = NULL;
3284 tnfa->num_transitions = add;
3285 tnfa->final = transitions + offs[tree->lastpos[0].position];
3286 tnfa->num_states = parse_ctx.position;
3287 tnfa->cflags = cflags;
3289 tre_mem_destroy(mem);
3290 tre_stack_destroy(stack);
3294 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3298 /* Free everything that was allocated and return the error code. */
3299 tre_mem_destroy(mem);
3301 tre_stack_destroy(stack);
3306 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3315 regfree(regex_t *preg)
3319 tre_tnfa_transition_t *trans;
3321 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3325 for (i = 0; i < tnfa->num_transitions; i++)
3326 if (tnfa->transitions[i].state)
3328 if (tnfa->transitions[i].tags)
3329 xfree(tnfa->transitions[i].tags);
3330 if (tnfa->transitions[i].neg_classes)
3331 xfree(tnfa->transitions[i].neg_classes);
3333 if (tnfa->transitions)
3334 xfree(tnfa->transitions);
3338 for (trans = tnfa->initial; trans->state; trans++)
3343 xfree(tnfa->initial);
3346 if (tnfa->submatch_data)
3348 for (i = 0; i < tnfa->num_submatches; i++)
3349 if (tnfa->submatch_data[i].parents)
3350 xfree(tnfa->submatch_data[i].parents);
3351 xfree(tnfa->submatch_data);
3354 if (tnfa->tag_directions)
3355 xfree(tnfa->tag_directions);
3356 if (tnfa->firstpos_chars)
3357 xfree(tnfa->firstpos_chars);
3358 if (tnfa->minimal_tags)
3359 xfree(tnfa->minimal_tags);