2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with this library; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
33 /***********************************************************************
34 from tre-ast.c and tre-ast.h
35 ***********************************************************************/
37 /* The different AST node types. */
45 /* Special subtypes of TRE_LITERAL. */
46 #define EMPTY -1 /* Empty leaf (denotes empty string). */
47 #define ASSERTION -2 /* Assertion leaf. */
48 #define TAG -3 /* Tag leaf. */
49 #define BACKREF -4 /* Back reference leaf. */
51 #define IS_SPECIAL(x) ((x)->code_min < 0)
52 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
53 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
54 #define IS_TAG(x) ((x)->code_min == TAG)
55 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
57 /* Taken from tre-compile.h */
65 tre_ctype_t *neg_classes;
69 /* A generic AST node. All AST nodes consist of this node on the top
70 level with `obj' pointing to the actual content. */
72 tre_ast_type_t type; /* Type of the node. */
73 void *obj; /* Pointer to actual node. */
78 tre_pos_and_tags_t *firstpos;
79 tre_pos_and_tags_t *lastpos;
83 /* A "literal" node. These are created for assertions, back references,
84 tags, matching parameter settings, and all expressions that match one
91 tre_ctype_t *neg_classes;
94 /* A "catenation" node. These are created when two regexps are concatenated.
95 If there are more than one subexpressions in sequence, the `left' part
96 holds all but the last, and `right' part holds the last subexpression
97 (catenation is left associative). */
100 tre_ast_node_t *right;
103 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
106 /* Subexpression to match. */
108 /* Minimum number of consecutive matches. */
110 /* Maximum number of consecutive matches. */
114 /* An "union" node. These are created for the "|" operator. */
116 tre_ast_node_t *left;
117 tre_ast_node_t *right;
120 static tre_ast_node_t *
121 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
123 tre_ast_node_t *node;
125 node = tre_mem_calloc(mem, sizeof(*node));
128 node->obj = tre_mem_calloc(mem, size);
133 node->submatch_id = -1;
138 static tre_ast_node_t *
139 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
141 tre_ast_node_t *node;
144 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
148 lit->code_min = code_min;
149 lit->code_max = code_max;
150 lit->position = position;
155 static tre_ast_node_t *
156 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max)
158 tre_ast_node_t *node;
159 tre_iteration_t *iter;
161 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
168 node->num_submatches = arg->num_submatches;
173 static tre_ast_node_t *
174 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
176 tre_ast_node_t *node;
178 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
181 ((tre_union_t *)node->obj)->left = left;
182 ((tre_union_t *)node->obj)->right = right;
183 node->num_submatches = left->num_submatches + right->num_submatches;
188 static tre_ast_node_t *
189 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
190 tre_ast_node_t *right)
192 tre_ast_node_t *node;
194 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
197 ((tre_catenation_t *)node->obj)->left = left;
198 ((tre_catenation_t *)node->obj)->right = right;
199 node->num_submatches = left->num_submatches + right->num_submatches;
204 /***********************************************************************
205 from tre-stack.c and tre-stack.h
206 ***********************************************************************/
208 /* Just to save some typing. */
209 #define STACK_PUSH(s, value) \
212 status = tre_stack_push(s, (void *)(value)); \
216 #define STACK_PUSHX(s, value) \
218 status = tre_stack_push(s, (void *)(value)); \
219 if (status != REG_OK) \
223 #define STACK_PUSHR(s, value) \
225 reg_errcode_t status; \
226 status = tre_stack_push(s, (void *)(value)); \
227 if (status != REG_OK) \
231 typedef struct tre_stack_rec {
241 tre_stack_new(int size, int max_size, int increment)
245 s = xmalloc(sizeof(*s));
248 s->stack = xmalloc(sizeof(*s->stack) * size);
249 if (s->stack == NULL)
255 s->max_size = max_size;
256 s->increment = increment;
263 tre_stack_destroy(tre_stack_t *s)
270 tre_stack_num_objects(tre_stack_t *s)
276 tre_stack_push(tre_stack_t *s, void *value)
278 if (s->ptr < s->size)
280 s->stack[s->ptr] = value;
285 if (s->size >= s->max_size)
287 DPRINT(("tre_stack_push: stack full\n"));
294 DPRINT(("tre_stack_push: trying to realloc more space\n"));
295 new_size = s->size + s->increment;
296 if (new_size > s->max_size)
297 new_size = s->max_size;
298 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
299 if (new_buffer == NULL)
301 DPRINT(("tre_stack_push: realloc failed.\n"));
304 DPRINT(("tre_stack_push: realloc succeeded.\n"));
305 assert(new_size > s->size);
307 s->stack = new_buffer;
308 tre_stack_push(s, value);
315 tre_stack_pop(tre_stack_t *s)
317 return s->stack[--s->ptr];
321 /***********************************************************************
322 from tre-parse.c and tre-parse.h
323 ***********************************************************************/
327 /* Memory allocator. The AST is allocated using this. */
329 /* Stack used for keeping track of regexp syntax. */
331 /* The parse result. */
332 tre_ast_node_t *result;
333 /* The regexp to parse and its length. */
334 const tre_char_t *re;
335 /* The first character of the entire regexp. */
336 const tre_char_t *re_start;
337 /* The first character after the end of the regexp. */
338 const tre_char_t *re_end;
340 /* Current submatch ID. */
342 /* Current position (number of literal). */
344 /* The highest back reference or -1 if none seen so far. */
346 /* Compilation flags. */
348 /* If this flag is set the top-level submatch is not captured. */
353 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
354 tre_ast_node_t ***items)
356 reg_errcode_t status;
357 tre_ast_node_t **array = *items;
358 /* Allocate more space if necessary. */
361 tre_ast_node_t **new_items;
362 DPRINT(("out of array space, i = %d\n", *i));
363 /* If the array is already 1024 items large, give up -- there's
364 probably an error in the regexp (e.g. not a '\0' terminated
365 string and missing ']') */
369 new_items = xrealloc(array, sizeof(*items) * *max_i);
370 if (new_items == NULL)
372 *items = array = new_items;
374 array[*i] = tre_ast_new_literal(mem, min, max, -1);
375 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
381 /* Expands a character class to character ranges. */
383 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
384 int *i, int *max_i, int cflags)
386 reg_errcode_t status = REG_OK;
388 int j, min = -1, max = 0;
389 assert(TRE_MB_CUR_MAX == 1);
391 DPRINT((" expanding class to character ranges\n"));
392 for (j = 0; (j < 256) && (status == REG_OK); j++)
395 if (tre_isctype(c, class)
396 || ((cflags & REG_ICASE)
397 && (tre_isctype(tre_tolower(c), class)
398 || tre_isctype(tre_toupper(c), class))))
406 DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max));
407 status = tre_new_item(mem, min, max, i, max_i, items);
411 if (min >= 0 && status == REG_OK)
412 status = tre_new_item(mem, min, max, i, max_i, items);
418 tre_compare_items(const void *a, const void *b)
420 tre_ast_node_t *node_a = *(tre_ast_node_t **)a;
421 tre_ast_node_t *node_b = *(tre_ast_node_t **)b;
422 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
423 int a_min = l_a->code_min, b_min = l_b->code_min;
427 else if (a_min > b_min)
433 /* Maximum number of character classes that can occur in a negated bracket
435 #define MAX_NEG_CLASSES 64
437 /* Maximum length of character class names. */
438 #define MAX_CLASS_NAME
441 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
442 tre_ctype_t neg_classes[], int *num_neg_classes,
443 tre_ast_node_t ***items, int *num_items,
446 const tre_char_t *re = ctx->re;
447 reg_errcode_t status = REG_OK;
448 tre_ctype_t class = (tre_ctype_t)0;
450 int max_i = *items_size;
453 /* Build an array of the items in the bracket expression. */
454 while (status == REG_OK)
457 if (re == ctx->re_end)
461 else if (*re == ']' && re > ctx->re)
463 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
464 ctx->re_end - re, re));
470 tre_cint_t min = 0, max = 0;
472 class = (tre_ctype_t)0;
473 if (re + 2 < ctx->re_end
474 && *(re + 1) == '-' && *(re + 2) != ']')
476 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n",
477 ctx->re_end - re, re));
481 /* XXX - Should use collation order instead of encoding values
482 in character ranges. */
486 else if (re + 1 < ctx->re_end
487 && *re == '[' && *(re + 1) == '.')
488 status = REG_ECOLLATE;
489 else if (re + 1 < ctx->re_end
490 && *re == '[' && *(re + 1) == '=')
491 status = REG_ECOLLATE;
492 else if (re + 1 < ctx->re_end
493 && *re == '[' && *(re + 1) == ':')
496 const tre_char_t *endptr = re + 2;
498 DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n",
499 ctx->re_end - re, re));
500 while (endptr < ctx->re_end && *endptr != ':')
502 if (endptr != ctx->re_end)
504 len = MIN(endptr - re - 2, 63);
507 tre_char_t tmp_wcs[64];
508 wcsncpy(tmp_wcs, re + 2, len);
510 #if defined HAVE_WCSRTOMBS
513 const tre_char_t *src = tmp_wcs;
514 memset(&state, '\0', sizeof(state));
515 len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
517 #elif defined HAVE_WCSTOMBS
518 len = wcstombs(tmp_str, tmp_wcs, 63);
519 #endif /* defined HAVE_WCSTOMBS */
521 #else /* !TRE_WCHAR */
522 strncpy(tmp_str, re + 2, len);
523 #endif /* !TRE_WCHAR */
525 DPRINT((" class name: %s\n", tmp_str));
526 class = tre_ctype(tmp_str);
529 /* Optimize character classes for 8 bit character sets. */
530 if (status == REG_OK && TRE_MB_CUR_MAX == 1)
532 status = tre_expand_ctype(ctx->mem, class, items,
533 &i, &max_i, ctx->cflags);
534 class = (tre_ctype_t)0;
546 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n",
547 ctx->re_end - re, re));
548 if (*re == '-' && *(re + 1) != ']'
550 /* Two ranges are not allowed to share and endpoint. */
555 if (status != REG_OK)
559 if (*num_neg_classes >= MAX_NEG_CLASSES)
562 neg_classes[(*num_neg_classes)++] = class;
565 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
566 if (status != REG_OK)
568 ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
571 /* Add opposite-case counterpoints if REG_ICASE is present.
572 This is broken if there are more than two "same" characters. */
573 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
577 DPRINT(("adding opposite-case counterpoints\n"));
580 if (tre_islower(min))
582 cmin = ccurr = tre_toupper(min++);
583 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
585 ccurr = tre_toupper(min++);
586 status = tre_new_item(ctx->mem, cmin, ccurr,
589 else if (tre_isupper(min))
591 cmin = ccurr = tre_tolower(min++);
592 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
594 ccurr = tre_tolower(min++);
595 status = tre_new_item(ctx->mem, cmin, ccurr,
599 if (status != REG_OK)
602 if (status != REG_OK)
614 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
616 tre_ast_node_t *node = NULL;
618 reg_errcode_t status = REG_OK;
619 tre_ast_node_t **items, *u, *n;
620 int i = 0, j, max_i = 32, curr_max, curr_min;
621 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
622 int num_neg_classes = 0;
624 /* Start off with an array of `max_i' elements. */
625 items = xmalloc(sizeof(*items) * max_i);
631 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n",
632 ctx->re_end - ctx->re, ctx->re));
637 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
640 if (status != REG_OK)
641 goto parse_bracket_done;
643 /* Sort the array if we need to negate it. */
645 qsort(items, i, sizeof(*items), tre_compare_items);
647 curr_max = curr_min = 0;
648 /* Build a union of the items in the array, negated if necessary. */
649 for (j = 0; j < i && status == REG_OK; j++)
652 tre_literal_t *l = items[j]->obj;
656 DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
657 (int)l->code_min, (int)l->code_max, (long)l->class, curr_max));
664 curr_max = MAX(max + 1, curr_max);
665 DPRINT(("overlap, curr_max = %d\n", curr_max));
672 if (curr_max >= curr_min)
674 DPRINT(("no overlap\n"));
675 l->code_min = curr_min;
676 l->code_max = curr_max;
680 DPRINT(("no overlap, zero room\n"));
683 curr_min = curr_max = max + 1;
690 DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
691 l->position = ctx->position;
692 if (num_neg_classes > 0)
694 l->neg_classes = tre_mem_alloc(ctx->mem,
695 (sizeof(l->neg_classes)
696 * (num_neg_classes + 1)));
697 if (l->neg_classes == NULL)
702 for (k = 0; k < num_neg_classes; k++)
703 l->neg_classes[k] = neg_classes[k];
704 l->neg_classes[k] = (tre_ctype_t)0;
707 l->neg_classes = NULL;
712 u = tre_ast_new_union(ctx->mem, node, items[j]);
720 if (status != REG_OK)
721 goto parse_bracket_done;
726 DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
727 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
732 tre_literal_t *l = n->obj;
733 if (num_neg_classes > 0)
735 l->neg_classes = tre_mem_alloc(ctx->mem,
736 (sizeof(l->neg_classes)
737 * (num_neg_classes + 1)));
738 if (l->neg_classes == NULL)
741 goto parse_bracket_done;
743 for (k = 0; k < num_neg_classes; k++)
744 l->neg_classes[k] = neg_classes[k];
745 l->neg_classes[k] = (tre_ctype_t)0;
748 l->neg_classes = NULL;
753 u = tre_ast_new_union(ctx->mem, node, n);
761 if (status != REG_OK)
762 goto parse_bracket_done;
766 #endif /* TRE_DEBUG */
776 /* Parses a positive decimal integer. Returns -1 if the string does not
777 contain a valid number. */
779 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
782 const tre_char_t *r = *regex;
783 while (r < regex_end && *r >= '0' && *r <= '9')
787 num = num * 10 + *r - '0';
796 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
799 const tre_char_t *r = ctx->re;
800 const tre_char_t *start;
803 /* Parse number (minimum repetition count). */
805 if (r < ctx->re_end && *r >= '0' && *r <= '9') {
806 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", ctx->re_end - r, r));
807 min = tre_parse_int(&r, ctx->re_end);
810 /* Parse comma and second number (maximum repetition count). */
812 if (r < ctx->re_end && *r == ',')
815 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", ctx->re_end - r, r));
816 max = tre_parse_int(&r, ctx->re_end);
819 /* Check that the repeat counts are sane. */
820 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
826 optionally followed immediately by a number == minimum repcount
827 optionally followed by , then a number == maximum repcount
835 /* Parse count limit settings */
838 while (r + 1 < ctx->re_end && !done)
856 } while (start != r);
859 if (r >= ctx->re_end)
862 /* Empty contents of {}. */
866 /* Parse the ending '}' or '\}'.*/
867 if (ctx->cflags & REG_EXTENDED)
869 if (r >= ctx->re_end || *r != '}')
875 if (r + 1 >= ctx->re_end
883 /* Create the AST node(s). */
884 if (min == 0 && max == 0)
886 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
892 if (min < 0 && max < 0)
893 /* Only approximate parameters set, no repetitions. */
896 *result = tre_ast_new_iter(ctx->mem, *result, min, max);
908 PARSE_MARK_FOR_SUBMATCH,
912 PARSE_POST_CATENATION,
917 } tre_parse_re_stack_symbol_t;
921 tre_parse(tre_parse_ctx_t *ctx)
923 tre_ast_node_t *result = NULL;
924 tre_parse_re_stack_symbol_t symbol;
925 reg_errcode_t status = REG_OK;
926 tre_stack_t *stack = ctx->stack;
927 int bottom = tre_stack_num_objects(stack);
930 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
931 ctx->len, ctx->re, ctx->len));
933 if (!ctx->nofirstsub)
935 STACK_PUSH(stack, ctx->re);
936 STACK_PUSH(stack, ctx->submatch_id);
937 STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH);
940 STACK_PUSH(stack, PARSE_RE);
941 ctx->re_start = ctx->re;
942 ctx->re_end = ctx->re + ctx->len;
945 /* The following is basically just a recursive descent parser. I use
946 an explicit stack instead of recursive functions mostly because of
947 two reasons: compatibility with systems which have an overflowable
948 call stack, and efficiency (both in lines of code and speed). */
949 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
951 if (status != REG_OK)
953 symbol = (tre_parse_re_stack_symbol_t)tre_stack_pop(stack);
957 /* Parse a full regexp. A regexp is one or more branches,
958 separated by the union operator `|'. */
959 if (ctx->cflags & REG_EXTENDED)
960 STACK_PUSHX(stack, PARSE_UNION);
961 STACK_PUSHX(stack, PARSE_BRANCH);
965 /* Parse a branch. A branch is one or more pieces, concatenated.
966 A piece is an atom possibly followed by a postfix operator. */
967 STACK_PUSHX(stack, PARSE_CATENATION);
968 STACK_PUSHX(stack, PARSE_PIECE);
972 /* Parse a piece. A piece is an atom possibly followed by one
973 or more postfix operators. */
974 STACK_PUSHX(stack, PARSE_POSTFIX);
975 STACK_PUSHX(stack, PARSE_ATOM);
978 case PARSE_CATENATION:
979 /* If the expression has not ended, parse another piece. */
982 if (ctx->re >= ctx->re_end)
985 if (ctx->cflags & REG_EXTENDED && c == '|')
987 if ((ctx->cflags & REG_EXTENDED
988 && c == ')' && depth > 0)
989 || (!(ctx->cflags & REG_EXTENDED)
991 && *(ctx->re + 1) == ')')))
993 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
995 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
996 ctx->re_end - ctx->re, ctx->re));
998 if (!(ctx->cflags & REG_EXTENDED))
1003 /* Left associative concatenation. */
1004 STACK_PUSHX(stack, PARSE_CATENATION);
1005 STACK_PUSHX(stack, result);
1006 STACK_PUSHX(stack, PARSE_POST_CATENATION);
1007 STACK_PUSHX(stack, PARSE_PIECE);
1011 case PARSE_POST_CATENATION:
1013 tre_ast_node_t *tree = tre_stack_pop(stack);
1014 tre_ast_node_t *tmp_node;
1015 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1023 if (ctx->re >= ctx->re_end)
1028 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1029 ctx->re_end - ctx->re, ctx->re));
1030 STACK_PUSHX(stack, PARSE_UNION);
1031 STACK_PUSHX(stack, result);
1032 STACK_PUSHX(stack, PARSE_POST_UNION);
1033 STACK_PUSHX(stack, PARSE_BRANCH);
1046 case PARSE_POST_UNION:
1048 tre_ast_node_t *tmp_node;
1049 tre_ast_node_t *tree = tre_stack_pop(stack);
1050 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1058 /* Parse postfix operators. */
1059 if (ctx->re >= ctx->re_end)
1065 if (!(ctx->cflags & REG_EXTENDED))
1069 tre_ast_node_t *tmp_node;
1072 if (*ctx->re == '+')
1074 if (*ctx->re == '?')
1078 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max);
1079 if (tmp_node == NULL)
1082 STACK_PUSHX(stack, PARSE_POSTFIX);
1087 /* "\{" is special without REG_EXTENDED */
1088 if (!(ctx->cflags & REG_EXTENDED)
1089 && ctx->re + 1 < ctx->re_end
1090 && *(ctx->re + 1) == '{')
1099 /* "{" is literal without REG_EXTENDED */
1100 if (!(ctx->cflags & REG_EXTENDED))
1104 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1105 ctx->re_end - ctx->re, ctx->re));
1108 status = tre_parse_bound(ctx, &result);
1109 if (status != REG_OK)
1111 STACK_PUSHX(stack, PARSE_POSTFIX);
1117 /* Parse an atom. An atom is a regular expression enclosed in `()',
1118 an empty set of `()', a bracket expression, `.', `^', `$',
1119 a `\' followed by a character, or a single character. */
1121 /* End of regexp? (empty string). */
1122 if (ctx->re >= ctx->re_end)
1127 case '(': /* parenthesized subexpression */
1129 if (ctx->cflags & REG_EXTENDED
1130 || (ctx->re > ctx->re_start
1131 && *(ctx->re - 1) == '\\'))
1135 DPRINT(("tre_parse: group begin: '%.*" STRF
1137 ctx->re_end - ctx->re, ctx->re,
1140 /* First parse a whole RE, then mark the resulting tree
1142 STACK_PUSHX(stack, ctx->submatch_id);
1143 STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH);
1144 STACK_PUSHX(stack, PARSE_RE);
1152 case ')': /* end of current subexpression */
1153 if ((ctx->cflags & REG_EXTENDED && depth > 0)
1154 || (ctx->re > ctx->re_start
1155 && *(ctx->re - 1) == '\\'))
1157 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1158 ctx->re_end - ctx->re, ctx->re));
1159 /* We were expecting an atom, but instead the current
1160 subexpression was closed. POSIX leaves the meaning of
1161 this to be implementation-defined. We interpret this as
1162 an empty expression (which matches an empty string). */
1163 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1166 if (!(ctx->cflags & REG_EXTENDED))
1173 case '[': /* bracket expression */
1174 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1175 ctx->re_end - ctx->re, ctx->re));
1177 status = tre_parse_bracket(ctx, &result);
1178 if (status != REG_OK)
1183 /* If this is "\(" or "\)" chew off the backslash and
1185 if (!(ctx->cflags & REG_EXTENDED)
1186 && ctx->re + 1 < ctx->re_end
1187 && (*(ctx->re + 1) == '('
1188 || *(ctx->re + 1) == ')'))
1191 STACK_PUSHX(stack, PARSE_ATOM);
1195 if (ctx->re + 1 >= ctx->re_end)
1196 /* Trailing backslash. */
1199 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n",
1200 ctx->re_end - ctx->re, ctx->re));
1205 if (!(ctx->cflags & REG_EXTENDED) && tre_isdigit(*ctx->re))
1207 /* Back reference. */
1208 int val = *ctx->re - '0';
1209 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
1210 ctx->re_end - ctx->re + 1, ctx->re - 1));
1211 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1216 ctx->max_backref = MAX(val, ctx->max_backref);
1221 /* Escaped character. */
1222 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
1223 ctx->re_end - ctx->re + 1, ctx->re - 1));
1224 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1235 case '.': /* the any-symbol */
1236 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
1237 ctx->re_end - ctx->re, ctx->re));
1238 if (ctx->cflags & REG_NEWLINE)
1240 tre_ast_node_t *tmp1;
1241 tre_ast_node_t *tmp2;
1242 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1246 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1250 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1257 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1266 case '^': /* beginning of line assertion */
1267 /* '^' has a special meaning everywhere in EREs, and in the
1268 beginning of the RE and after \( is BREs. */
1269 if (ctx->cflags & REG_EXTENDED
1270 || (ctx->re - 2 >= ctx->re_start
1271 && *(ctx->re - 2) == '\\'
1272 && *(ctx->re - 1) == '(')
1273 || ctx->re == ctx->re_start)
1275 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
1276 ctx->re_end - ctx->re, ctx->re));
1277 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1287 case '$': /* end of line assertion. */
1288 /* '$' is special everywhere in EREs, and in the end of the
1289 string and before \) is BREs. */
1290 if (ctx->cflags & REG_EXTENDED
1291 || (ctx->re + 2 < ctx->re_end
1292 && *(ctx->re + 1) == '\\'
1293 && *(ctx->re + 2) == ')')
1294 || ctx->re + 1 == ctx->re_end)
1296 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
1297 ctx->re_end - ctx->re, ctx->re));
1298 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1311 /* We are expecting an atom. If the subexpression (or the whole
1312 regexp ends here, we interpret it as an empty expression
1313 (which matches an empty string). */
1315 (ctx->re >= ctx->re_end
1317 || (ctx->cflags & REG_EXTENDED
1321 || *ctx->re == '?'))
1322 /* Test for "\)" in BRE mode. */
1323 || (!(ctx->cflags & REG_EXTENDED)
1324 && ctx->re + 1 < ctx->re_end
1326 && *(ctx->re + 1) == '{')))
1328 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1329 ctx->re_end - ctx->re, ctx->re));
1330 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1336 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
1337 ctx->re_end - ctx->re, ctx->re));
1338 /* Note that we can't use an tre_isalpha() test here, since there
1339 may be characters which are alphabetic but neither upper or
1341 if (ctx->cflags & REG_ICASE
1342 && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1344 tre_ast_node_t *tmp1;
1345 tre_ast_node_t *tmp2;
1347 /* XXX - Can there be more than one opposite-case
1348 counterpoints for some character in some locale? Or
1349 more than two characters which all should be regarded
1350 the same character if case is ignored? If yes, there
1351 does not seem to be a portable way to detect it. I guess
1352 that at least for multi-character collating elements there
1353 could be several opposite-case counterpoints, but they
1354 cannot be supported portably anyway. */
1355 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1356 tre_toupper(*ctx->re),
1360 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1361 tre_tolower(*ctx->re),
1365 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1371 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1382 case PARSE_MARK_FOR_SUBMATCH:
1384 int submatch_id = (int)tre_stack_pop(stack);
1386 if (result->submatch_id >= 0)
1388 tre_ast_node_t *n, *tmp_node;
1389 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1392 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1393 if (tmp_node == NULL)
1395 tmp_node->num_submatches = result->num_submatches;
1398 result->submatch_id = submatch_id;
1399 result->num_submatches++;
1403 case PARSE_RESTORE_CFLAGS:
1404 ctx->cflags = (int)tre_stack_pop(stack);
1409 /* Check for missing closing parentheses. */
1413 if (status == REG_OK)
1414 ctx->result = result;
1420 /***********************************************************************
1422 ***********************************************************************/
1425 Algorithms to setup tags so that submatch addressing can be done.
1429 /* Inserts a catenation node to the root of the tree given in `node'.
1430 As the left child a new tag with number `tag_id' to `node' is added,
1431 and the right child is the old root. */
1433 /* Inserts a catenation node to the root of the tree given in `node'.
1434 As the right child a new tag with number `tag_id' to `node' is added,
1435 and the left child is the old root. */
1436 static reg_errcode_t
1437 tre_add_tag(tre_mem_t mem, tre_ast_node_t *node, int tag_id, int right)
1439 tre_catenation_t *c;
1440 tre_ast_node_t *child_tag, *child_old;
1442 DPRINT(("add_tag_%s: tag %d\n", right ? "right" : "left", tag_id));
1444 c = tre_mem_alloc(mem, sizeof(*c));
1447 child_tag = tre_ast_new_literal(mem, TAG, tag_id, -1);
1448 if (child_tag == NULL)
1450 child_old = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1451 if (child_old == NULL)
1454 child_old->obj = node->obj;
1455 child_old->type = node->type;
1456 child_old->nullable = -1;
1457 child_old->submatch_id = -1;
1458 child_old->firstpos = NULL;
1459 child_old->lastpos = NULL;
1460 child_old->num_tags = 0;
1462 node->type = CATENATION;
1464 c->right = c->left = child_old;
1465 if (right) c->right = child_tag;
1466 else c->left = child_tag;
1473 ADDTAGS_AFTER_ITERATION,
1474 ADDTAGS_AFTER_UNION_LEFT,
1475 ADDTAGS_AFTER_UNION_RIGHT,
1476 ADDTAGS_AFTER_CAT_LEFT,
1477 ADDTAGS_AFTER_CAT_RIGHT,
1478 ADDTAGS_SET_SUBMATCH_END
1479 } tre_addtags_symbol_t;
1487 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1488 subexpressions marked for submatch addressing can be traced. */
1489 static reg_errcode_t
1490 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1493 reg_errcode_t status = REG_OK;
1494 tre_addtags_symbol_t symbol;
1495 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1496 int bottom = tre_stack_num_objects(stack);
1497 /* True for first pass (counting number of needed tags) */
1498 int first_pass = (mem == NULL || tnfa == NULL);
1499 int *regset, *orig_regset;
1500 int num_tags = 0; /* Total number of tags. */
1501 int tag = 0; /* The tag that is to be added next. */
1502 int next_tag = 1; /* Next tag to use after this one. */
1503 int *parents; /* Stack of submatches the current submatch is
1505 tre_tag_states_t *saved_states;
1507 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1511 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1515 orig_regset = regset;
1517 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1518 if (parents == NULL)
1525 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1526 if (saved_states == NULL)
1535 for (i = 0; i <= tnfa->num_submatches; i++)
1536 saved_states[i].tag = -1;
1539 STACK_PUSH(stack, node);
1540 STACK_PUSH(stack, ADDTAGS_RECURSE);
1542 while (tre_stack_num_objects(stack) > bottom)
1544 if (status != REG_OK)
1547 symbol = (tre_addtags_symbol_t)tre_stack_pop(stack);
1551 case ADDTAGS_SET_SUBMATCH_END:
1553 int id = (int)tre_stack_pop(stack);
1556 /* Add end of this submatch to regset. */
1557 for (i = 0; regset[i] >= 0; i++);
1558 regset[i] = id * 2 + 1;
1561 /* Pop this submatch from the parents stack. */
1562 for (i = 0; parents[i] >= 0; i++);
1563 parents[i - 1] = -1;
1567 case ADDTAGS_RECURSE:
1568 node = tre_stack_pop(stack);
1570 if (node->submatch_id >= 0)
1572 int id = node->submatch_id;
1576 /* Add start of this submatch to regset. */
1577 for (i = 0; regset[i] >= 0; i++);
1583 for (i = 0; parents[i] >= 0; i++);
1584 tnfa->submatch_data[id].parents = NULL;
1587 int *p = xmalloc(sizeof(*p) * (i + 1));
1590 status = REG_ESPACE;
1593 assert(tnfa->submatch_data[id].parents == NULL);
1594 tnfa->submatch_data[id].parents = p;
1595 for (i = 0; parents[i] >= 0; i++)
1601 /* Add end of this submatch to regset after processing this
1603 STACK_PUSHX(stack, node->submatch_id);
1604 STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END);
1611 tre_literal_t *lit = node->obj;
1613 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1616 DPRINT(("Literal %d-%d\n",
1617 (int)lit->code_min, (int)lit->code_max));
1620 /* Regset is not empty, so add a tag before the
1621 literal or backref. */
1624 status = tre_add_tag(mem, node, tag, 0 /*left*/);
1625 tnfa->tag_directions[tag] = direction;
1626 /* Go through the regset and set submatch data for
1627 submatches that are using this tag. */
1628 for (i = 0; regset[i] >= 0; i++)
1630 int id = regset[i] >> 1;
1631 int start = !(regset[i] & 1);
1632 DPRINT((" Using tag %d for %s offset of "
1633 "submatch %d\n", tag,
1634 start ? "start" : "end", id));
1636 tnfa->submatch_data[id].so_tag = tag;
1638 tnfa->submatch_data[id].eo_tag = tag;
1643 DPRINT((" num_tags = 1\n"));
1647 DPRINT((" num_tags++\n"));
1656 assert(!IS_TAG(lit));
1662 tre_catenation_t *cat = node->obj;
1663 tre_ast_node_t *left = cat->left;
1664 tre_ast_node_t *right = cat->right;
1665 int reserved_tag = -1;
1666 DPRINT(("Catenation, next_tag = %d\n", next_tag));
1669 /* After processing right child. */
1670 STACK_PUSHX(stack, node);
1671 STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT);
1673 /* Process right child. */
1674 STACK_PUSHX(stack, right);
1675 STACK_PUSHX(stack, ADDTAGS_RECURSE);
1677 /* After processing left child. */
1678 STACK_PUSHX(stack, next_tag + left->num_tags);
1679 DPRINT((" Pushing %d for after left\n",
1680 next_tag + left->num_tags));
1681 if (left->num_tags > 0 && right->num_tags > 0)
1683 /* Reserve the next tag to the right child. */
1684 DPRINT((" Reserving next_tag %d to right child\n",
1686 reserved_tag = next_tag;
1689 STACK_PUSHX(stack, reserved_tag);
1690 STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT);
1692 /* Process left child. */
1693 STACK_PUSHX(stack, left);
1694 STACK_PUSHX(stack, ADDTAGS_RECURSE);
1700 tre_iteration_t *iter = node->obj;
1701 DPRINT(("Iteration\n"));
1705 STACK_PUSHX(stack, regset[0] >= 0);
1709 STACK_PUSHX(stack, tag);
1711 STACK_PUSHX(stack, node);
1712 STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION);
1714 STACK_PUSHX(stack, iter->arg);
1715 STACK_PUSHX(stack, ADDTAGS_RECURSE);
1717 /* Regset is not empty, so add a tag here. */
1723 status = tre_add_tag(mem, node, tag, 0 /*left*/);
1724 tnfa->tag_directions[tag] = direction;
1725 /* Go through the regset and set submatch data for
1726 submatches that are using this tag. */
1727 for (i = 0; regset[i] >= 0; i++)
1729 int id = regset[i] >> 1;
1730 int start = !(regset[i] & 1);
1731 DPRINT((" Using tag %d for %s offset of "
1732 "submatch %d\n", tag,
1733 start ? "start" : "end", id));
1735 tnfa->submatch_data[id].so_tag = tag;
1737 tnfa->submatch_data[id].eo_tag = tag;
1741 DPRINT((" num_tags++\n"));
1747 direction = TRE_TAG_MINIMIZE;
1752 tre_union_t *uni = node->obj;
1753 tre_ast_node_t *left = uni->left;
1754 tre_ast_node_t *right = uni->right;
1760 left_tag = next_tag;
1761 right_tag = next_tag + 1;
1766 right_tag = next_tag;
1769 DPRINT(("Union\n"));
1771 /* After processing right child. */
1772 STACK_PUSHX(stack, right_tag);
1773 STACK_PUSHX(stack, left_tag);
1774 STACK_PUSHX(stack, regset);
1775 STACK_PUSHX(stack, regset[0] >= 0);
1776 STACK_PUSHX(stack, node);
1777 STACK_PUSHX(stack, right);
1778 STACK_PUSHX(stack, left);
1779 STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT);
1781 /* Process right child. */
1782 STACK_PUSHX(stack, right);
1783 STACK_PUSHX(stack, ADDTAGS_RECURSE);
1785 /* After processing left child. */
1786 STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT);
1788 /* Process left child. */
1789 STACK_PUSHX(stack, left);
1790 STACK_PUSHX(stack, ADDTAGS_RECURSE);
1792 /* Regset is not empty, so add a tag here. */
1798 status = tre_add_tag(mem, node, tag, 0 /*left*/);
1799 tnfa->tag_directions[tag] = direction;
1800 /* Go through the regset and set submatch data for
1801 submatches that are using this tag. */
1802 for (i = 0; regset[i] >= 0; i++)
1804 int id = regset[i] >> 1;
1805 int start = !(regset[i] & 1);
1806 DPRINT((" Using tag %d for %s offset of "
1807 "submatch %d\n", tag,
1808 start ? "start" : "end", id));
1810 tnfa->submatch_data[id].so_tag = tag;
1812 tnfa->submatch_data[id].eo_tag = tag;
1816 DPRINT((" num_tags++\n"));
1823 if (node->num_submatches > 0)
1825 /* The next two tags are reserved for markers. */
1835 if (node->submatch_id >= 0)
1838 /* Push this submatch on the parents stack. */
1839 for (i = 0; parents[i] >= 0; i++);
1840 parents[i] = node->submatch_id;
1841 parents[i + 1] = -1;
1844 break; /* end case: ADDTAGS_RECURSE */
1846 case ADDTAGS_AFTER_ITERATION:
1849 node = tre_stack_pop(stack);
1851 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1852 + (int)tre_stack_pop(stack);
1854 enter_tag = (int)tre_stack_pop(stack);
1856 DPRINT(("After iteration\n"));
1857 direction = TRE_TAG_MAXIMIZE;
1861 case ADDTAGS_AFTER_CAT_LEFT:
1863 int new_tag = (int)tre_stack_pop(stack);
1864 next_tag = (int)tre_stack_pop(stack);
1865 DPRINT(("After cat left, tag = %d, next_tag = %d\n",
1869 DPRINT((" Setting tag to %d\n", new_tag));
1875 case ADDTAGS_AFTER_CAT_RIGHT:
1876 DPRINT(("After cat right\n"));
1877 node = tre_stack_pop(stack);
1879 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1880 + ((tre_catenation_t *)node->obj)->right->num_tags;
1883 case ADDTAGS_AFTER_UNION_LEFT:
1884 DPRINT(("After union left\n"));
1885 /* Lift the bottom of the `regset' array so that when processing
1886 the right operand the items currently in the array are
1887 invisible. The original bottom was saved at ADDTAGS_UNION and
1888 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1889 while (*regset >= 0)
1893 case ADDTAGS_AFTER_UNION_RIGHT:
1895 int added_tags, tag_left, tag_right;
1896 tre_ast_node_t *left = tre_stack_pop(stack);
1897 tre_ast_node_t *right = tre_stack_pop(stack);
1898 DPRINT(("After union right\n"));
1899 node = tre_stack_pop(stack);
1900 added_tags = (int)tre_stack_pop(stack);
1903 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1904 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1905 + ((node->num_submatches > 0) ? 2 : 0);
1907 regset = tre_stack_pop(stack);
1908 tag_left = (int)tre_stack_pop(stack);
1909 tag_right = (int)tre_stack_pop(stack);
1911 /* Add tags after both children, the left child gets a smaller
1912 tag than the right child. This guarantees that we prefer
1913 the left child over the right child. */
1914 /* XXX - This is not always necessary (if the children have
1915 tags which must be seen for every match of that child). */
1916 /* XXX - Check if this is the only place where tre_add_tag_right
1917 is used. If so, use tre_add_tag_left (putting the tag before
1918 the child as opposed after the child) and throw away
1919 tre_add_tag_right. */
1920 if (node->num_submatches > 0)
1924 status = tre_add_tag(mem, left, tag_left, 1 /*right*/);
1925 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1926 status = tre_add_tag(mem, right, tag_right, 1 /*right*/);
1927 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1929 DPRINT((" num_tags += 2\n"));
1932 direction = TRE_TAG_MAXIMIZE;
1940 } /* end switch(symbol) */
1941 } /* end while(tre_stack_num_objects(stack) > bottom) */
1946 /* Go through the regset and set submatch data for
1947 submatches that are using this tag. */
1948 for (i = 0; regset[i] >= 0; i++)
1950 int id = regset[i] >> 1;
1951 int start = !(regset[i] & 1);
1952 DPRINT((" Using tag %d for %s offset of "
1953 "submatch %d\n", num_tags,
1954 start ? "start" : "end", id));
1956 tnfa->submatch_data[id].so_tag = num_tags;
1958 tnfa->submatch_data[id].eo_tag = num_tags;
1962 DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
1963 first_pass? "First pass" : "Second pass", num_tags));
1965 assert(tree->num_tags == num_tags);
1966 tnfa->end_tag = num_tags;
1967 tnfa->num_tags = num_tags;
1970 xfree(saved_states);
1977 AST to TNFA compilation routines.
1983 } tre_copyast_symbol_t;
1985 /* Flags for tre_copy_ast(). */
1986 #define COPY_REMOVE_TAGS 1
1987 #define COPY_MAXIMIZE_FIRST_TAG 2
1989 static reg_errcode_t
1990 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1991 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1992 tre_ast_node_t **copy, int *max_pos)
1994 reg_errcode_t status = REG_OK;
1995 int bottom = tre_stack_num_objects(stack);
1998 tre_ast_node_t **result = copy;
1999 tre_copyast_symbol_t symbol;
2001 STACK_PUSH(stack, ast);
2002 STACK_PUSH(stack, COPY_RECURSE);
2004 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2006 tre_ast_node_t *node;
2007 if (status != REG_OK)
2010 symbol = (tre_copyast_symbol_t)tre_stack_pop(stack);
2013 case COPY_SET_RESULT_PTR:
2014 result = tre_stack_pop(stack);
2017 node = tre_stack_pop(stack);
2022 tre_literal_t *lit = node->obj;
2023 int pos = lit->position;
2024 int min = lit->code_min;
2025 int max = lit->code_max;
2026 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2028 /* XXX - e.g. [ab] has only one position but two
2029 nodes, so we are creating holes in the state space
2030 here. Not fatal, just wastes memory. */
2034 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2036 /* Change this tag to empty. */
2040 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2043 /* Maximize the first tag. */
2044 tag_directions[max] = TRE_TAG_MAXIMIZE;
2047 *result = tre_ast_new_literal(mem, min, max, pos);
2048 if (*result == NULL)
2049 status = REG_ESPACE;
2057 tre_union_t *uni = node->obj;
2059 *result = tre_ast_new_union(mem, uni->left, uni->right);
2060 if (*result == NULL)
2062 status = REG_ESPACE;
2065 copy = (*result)->obj;
2066 result = ©->left;
2067 STACK_PUSHX(stack, uni->right);
2068 STACK_PUSHX(stack, COPY_RECURSE);
2069 STACK_PUSHX(stack, ©->right);
2070 STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
2071 STACK_PUSHX(stack, uni->left);
2072 STACK_PUSHX(stack, COPY_RECURSE);
2077 tre_catenation_t *cat = node->obj;
2078 tre_catenation_t *copy;
2079 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2080 if (*result == NULL)
2082 status = REG_ESPACE;
2085 copy = (*result)->obj;
2088 result = ©->left;
2090 STACK_PUSHX(stack, cat->right);
2091 STACK_PUSHX(stack, COPY_RECURSE);
2092 STACK_PUSHX(stack, ©->right);
2093 STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
2094 STACK_PUSHX(stack, cat->left);
2095 STACK_PUSHX(stack, COPY_RECURSE);
2100 tre_iteration_t *iter = node->obj;
2101 STACK_PUSHX(stack, iter->arg);
2102 STACK_PUSHX(stack, COPY_RECURSE);
2103 *result = tre_ast_new_iter(mem, iter->arg, iter->min, iter->max);
2104 if (*result == NULL)
2106 status = REG_ESPACE;
2109 iter = (*result)->obj;
2110 result = &iter->arg;
2120 *pos_add += num_copied;
2127 } tre_expand_ast_symbol_t;
2129 /* Expands each iteration node that has a finite nonzero minimum or maximum
2130 iteration count to a catenated sequence of copies of the node. */
2131 static reg_errcode_t
2132 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2133 int *position, tre_tag_direction_t *tag_directions,
2136 reg_errcode_t status = REG_OK;
2137 int bottom = tre_stack_num_objects(stack);
2139 int pos_add_total = 0;
2141 /* Approximate parameter nesting level. */
2144 STACK_PUSHR(stack, ast);
2145 STACK_PUSHR(stack, EXPAND_RECURSE);
2146 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2148 tre_ast_node_t *node;
2149 tre_expand_ast_symbol_t symbol;
2151 if (status != REG_OK)
2154 DPRINT(("pos_add %d\n", pos_add));
2156 symbol = (tre_expand_ast_symbol_t)tre_stack_pop(stack);
2157 node = tre_stack_pop(stack);
2160 case EXPAND_RECURSE:
2165 tre_literal_t *lit= node->obj;
2166 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2168 lit->position += pos_add;
2169 if (lit->position > max_pos)
2170 max_pos = lit->position;
2176 tre_union_t *uni = node->obj;
2177 STACK_PUSHX(stack, uni->right);
2178 STACK_PUSHX(stack, EXPAND_RECURSE);
2179 STACK_PUSHX(stack, uni->left);
2180 STACK_PUSHX(stack, EXPAND_RECURSE);
2185 tre_catenation_t *cat = node->obj;
2186 STACK_PUSHX(stack, cat->right);
2187 STACK_PUSHX(stack, EXPAND_RECURSE);
2188 STACK_PUSHX(stack, cat->left);
2189 STACK_PUSHX(stack, EXPAND_RECURSE);
2194 tre_iteration_t *iter = node->obj;
2195 STACK_PUSHX(stack, pos_add);
2196 STACK_PUSHX(stack, node);
2197 STACK_PUSHX(stack, EXPAND_AFTER_ITER);
2198 STACK_PUSHX(stack, iter->arg);
2199 STACK_PUSHX(stack, EXPAND_RECURSE);
2200 /* If we are going to expand this node at EXPAND_AFTER_ITER
2201 then don't increase the `pos' fields of the nodes now, it
2202 will get done when expanding. */
2203 if (iter->min > 1 || iter->max > 1)
2214 case EXPAND_AFTER_ITER:
2216 tre_iteration_t *iter = node->obj;
2218 pos_add = (int)tre_stack_pop(stack);
2219 pos_add_last = pos_add;
2220 if (iter->min > 1 || iter->max > 1)
2222 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2224 int pos_add_save = pos_add;
2226 /* Create a catenated sequence of copies of the node. */
2227 for (i = 0; i < iter->min; i++)
2229 tre_ast_node_t *copy;
2230 /* Remove tags from all but the last copy. */
2231 int flags = ((i + 1 < iter->min)
2233 : COPY_MAXIMIZE_FIRST_TAG);
2234 DPRINT((" pos_add %d\n", pos_add));
2235 pos_add_save = pos_add;
2236 status = tre_copy_ast(mem, stack, iter->arg, flags,
2237 &pos_add, tag_directions, ©,
2239 if (status != REG_OK)
2242 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2249 if (iter->max == -1)
2251 /* No upper limit. */
2252 pos_add_save = pos_add;
2253 status = tre_copy_ast(mem, stack, iter->arg, 0,
2254 &pos_add, NULL, &seq2, &max_pos);
2255 if (status != REG_OK)
2257 seq2 = tre_ast_new_iter(mem, seq2, 0, -1);
2263 for (i = iter->min; i < iter->max; i++)
2265 tre_ast_node_t *tmp, *copy;
2266 pos_add_save = pos_add;
2267 status = tre_copy_ast(mem, stack, iter->arg, 0,
2268 &pos_add, NULL, ©, &max_pos);
2269 if (status != REG_OK)
2272 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2277 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2280 seq2 = tre_ast_new_union(mem, tmp, seq2);
2286 pos_add = pos_add_save;
2289 else if (seq2 != NULL)
2290 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2293 node->obj = seq1->obj;
2294 node->type = seq1->type;
2298 pos_add_total += pos_add - pos_add_last;
2299 if (iter_depth == 0)
2300 pos_add = pos_add_total;
2310 *position += pos_add_total;
2312 /* `max_pos' should never be larger than `*position' if the above
2313 code works, but just an extra safeguard let's make sure
2314 `*position' is set large enough so enough memory will be
2315 allocated for the transition table. */
2316 if (max_pos > *position)
2317 *position = max_pos;
2320 DPRINT(("Expanded AST:\n"));
2322 DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
2328 static tre_pos_and_tags_t *
2329 tre_set_empty(tre_mem_t mem)
2331 tre_pos_and_tags_t *new_set;
2333 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2334 if (new_set == NULL)
2337 new_set[0].position = -1;
2338 new_set[0].code_min = -1;
2339 new_set[0].code_max = -1;
2344 static tre_pos_and_tags_t *
2345 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2346 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2348 tre_pos_and_tags_t *new_set;
2350 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2351 if (new_set == NULL)
2354 new_set[0].position = position;
2355 new_set[0].code_min = code_min;
2356 new_set[0].code_max = code_max;
2357 new_set[0].class = class;
2358 new_set[0].neg_classes = neg_classes;
2359 new_set[0].backref = backref;
2360 new_set[1].position = -1;
2361 new_set[1].code_min = -1;
2362 new_set[1].code_max = -1;
2367 static tre_pos_and_tags_t *
2368 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2369 int *tags, int assertions)
2372 tre_pos_and_tags_t *new_set;
2376 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2377 for (s1 = 0; set1[s1].position >= 0; s1++);
2378 for (s2 = 0; set2[s2].position >= 0; s2++);
2379 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2383 for (s1 = 0; set1[s1].position >= 0; s1++)
2385 new_set[s1].position = set1[s1].position;
2386 new_set[s1].code_min = set1[s1].code_min;
2387 new_set[s1].code_max = set1[s1].code_max;
2388 new_set[s1].assertions = set1[s1].assertions | assertions;
2389 new_set[s1].class = set1[s1].class;
2390 new_set[s1].neg_classes = set1[s1].neg_classes;
2391 new_set[s1].backref = set1[s1].backref;
2392 if (set1[s1].tags == NULL && tags == NULL)
2393 new_set[s1].tags = NULL;
2396 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2397 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2398 * (i + num_tags + 1)));
2399 if (new_tags == NULL)
2401 for (j = 0; j < i; j++)
2402 new_tags[j] = set1[s1].tags[j];
2403 for (i = 0; i < num_tags; i++)
2404 new_tags[j + i] = tags[i];
2405 new_tags[j + i] = -1;
2406 new_set[s1].tags = new_tags;
2410 for (s2 = 0; set2[s2].position >= 0; s2++)
2412 new_set[s1 + s2].position = set2[s2].position;
2413 new_set[s1 + s2].code_min = set2[s2].code_min;
2414 new_set[s1 + s2].code_max = set2[s2].code_max;
2415 /* XXX - why not | assertions here as well? */
2416 new_set[s1 + s2].assertions = set2[s2].assertions;
2417 new_set[s1 + s2].class = set2[s2].class;
2418 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2419 new_set[s1 + s2].backref = set2[s2].backref;
2420 if (set2[s2].tags == NULL)
2421 new_set[s1 + s2].tags = NULL;
2424 for (i = 0; set2[s2].tags[i] >= 0; i++);
2425 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2426 if (new_tags == NULL)
2428 for (j = 0; j < i; j++)
2429 new_tags[j] = set2[s2].tags[j];
2431 new_set[s1 + s2].tags = new_tags;
2434 new_set[s1 + s2].position = -1;
2438 /* Finds the empty path through `node' which is the one that should be
2439 taken according to POSIX.2 rules, and adds the tags on that path to
2440 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2441 set to the number of tags seen on the path. */
2442 static reg_errcode_t
2443 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2444 int *assertions, int *num_tags_seen)
2448 tre_catenation_t *cat;
2449 tre_iteration_t *iter;
2451 int bottom = tre_stack_num_objects(stack);
2452 reg_errcode_t status = REG_OK;
2456 status = tre_stack_push(stack, node);
2458 /* Walk through the tree recursively. */
2459 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2461 node = tre_stack_pop(stack);
2466 lit = (tre_literal_t *)node->obj;
2467 switch (lit->code_min)
2470 if (lit->code_max >= 0)
2474 /* Add the tag to `tags'. */
2475 for (i = 0; tags[i] >= 0; i++)
2476 if (tags[i] == lit->code_max)
2480 tags[i] = lit->code_max;
2489 assert(lit->code_max >= 1
2490 || lit->code_max <= ASSERT_LAST);
2491 if (assertions != NULL)
2492 *assertions |= lit->code_max;
2503 /* Subexpressions starting earlier take priority over ones
2504 starting later, so we prefer the left subexpression over the
2505 right subexpression. */
2506 uni = (tre_union_t *)node->obj;
2507 if (uni->left->nullable)
2508 STACK_PUSHX(stack, uni->left)
2509 else if (uni->right->nullable)
2510 STACK_PUSHX(stack, uni->right)
2516 /* The path must go through both children. */
2517 cat = (tre_catenation_t *)node->obj;
2518 assert(cat->left->nullable);
2519 assert(cat->right->nullable);
2520 STACK_PUSHX(stack, cat->left);
2521 STACK_PUSHX(stack, cat->right);
2525 /* A match with an empty string is preferred over no match at
2526 all, so we go through the argument if possible. */
2527 iter = (tre_iteration_t *)node->obj;
2528 if (iter->arg->nullable)
2529 STACK_PUSHX(stack, iter->arg);
2545 NFL_POST_CATENATION,
2547 } tre_nfl_stack_symbol_t;
2550 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2551 the nodes of the AST `tree'. */
2552 static reg_errcode_t
2553 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2555 int bottom = tre_stack_num_objects(stack);
2557 STACK_PUSHR(stack, tree);
2558 STACK_PUSHR(stack, NFL_RECURSE);
2560 while (tre_stack_num_objects(stack) > bottom)
2562 tre_nfl_stack_symbol_t symbol;
2563 tre_ast_node_t *node;
2565 symbol = (tre_nfl_stack_symbol_t) tre_stack_pop(stack);
2566 node = tre_stack_pop(stack);
2574 tre_literal_t *lit = (tre_literal_t *)node->obj;
2575 if (IS_BACKREF(lit))
2577 /* Back references: nullable = false, firstpos = {i},
2580 node->firstpos = tre_set_one(mem, lit->position, 0,
2581 TRE_CHAR_MAX, 0, NULL, -1);
2582 if (!node->firstpos)
2584 node->lastpos = tre_set_one(mem, lit->position, 0,
2585 TRE_CHAR_MAX, 0, NULL,
2590 else if (lit->code_min < 0)
2592 /* Tags, empty strings and zero width assertions:
2593 nullable = true, firstpos = {}, and lastpos = {}. */
2595 node->firstpos = tre_set_empty(mem);
2596 if (!node->firstpos)
2598 node->lastpos = tre_set_empty(mem);
2604 /* Literal at position i: nullable = false, firstpos = {i},
2608 tre_set_one(mem, lit->position, lit->code_min,
2609 lit->code_max, 0, NULL, -1);
2610 if (!node->firstpos)
2612 node->lastpos = tre_set_one(mem, lit->position,
2613 lit->code_min, lit->code_max,
2614 lit->class, lit->neg_classes,
2623 /* Compute the attributes for the two subtrees, and after that
2625 STACK_PUSHR(stack, node);
2626 STACK_PUSHR(stack, NFL_POST_UNION);
2627 STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right);
2628 STACK_PUSHR(stack, NFL_RECURSE);
2629 STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left);
2630 STACK_PUSHR(stack, NFL_RECURSE);
2634 /* Compute the attributes for the two subtrees, and after that
2636 STACK_PUSHR(stack, node);
2637 STACK_PUSHR(stack, NFL_POST_CATENATION);
2638 STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right);
2639 STACK_PUSHR(stack, NFL_RECURSE);
2640 STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left);
2641 STACK_PUSHR(stack, NFL_RECURSE);
2645 /* Compute the attributes for the subtree, and after that for
2647 STACK_PUSHR(stack, node);
2648 STACK_PUSHR(stack, NFL_POST_ITERATION);
2649 STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg);
2650 STACK_PUSHR(stack, NFL_RECURSE);
2653 break; /* end case: NFL_RECURSE */
2655 case NFL_POST_UNION:
2657 tre_union_t *uni = (tre_union_t *)node->obj;
2658 node->nullable = uni->left->nullable || uni->right->nullable;
2659 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2660 uni->right->firstpos, NULL, 0);
2661 if (!node->firstpos)
2663 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2664 uni->right->lastpos, NULL, 0);
2670 case NFL_POST_ITERATION:
2672 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2674 if (iter->min == 0 || iter->arg->nullable)
2678 node->firstpos = iter->arg->firstpos;
2679 node->lastpos = iter->arg->lastpos;
2683 case NFL_POST_CATENATION:
2685 int num_tags, *tags, assertions;
2686 reg_errcode_t status;
2687 tre_catenation_t *cat = node->obj;
2688 node->nullable = cat->left->nullable && cat->right->nullable;
2690 /* Compute firstpos. */
2691 if (cat->left->nullable)
2693 /* The left side matches the empty string. Make a first pass
2694 with tre_match_empty() to get the number of tags. */
2695 status = tre_match_empty(stack, cat->left,
2696 NULL, NULL, &num_tags);
2697 if (status != REG_OK)
2699 /* Allocate arrays for the tags and parameters. */
2700 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2705 /* Second pass with tre_mach_empty() to get the list of
2707 status = tre_match_empty(stack, cat->left, tags,
2709 if (status != REG_OK)
2715 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2718 if (!node->firstpos)
2723 node->firstpos = cat->left->firstpos;
2726 /* Compute lastpos. */
2727 if (cat->right->nullable)
2729 /* The right side matches the empty string. Make a first pass
2730 with tre_match_empty() to get the number of tags. */
2731 status = tre_match_empty(stack, cat->right,
2732 NULL, NULL, &num_tags);
2733 if (status != REG_OK)
2735 /* Allocate arrays for the tags and parameters. */
2736 tags = xmalloc(sizeof(int) * (num_tags + 1));
2741 /* Second pass with tre_mach_empty() to get the list of
2743 status = tre_match_empty(stack, cat->right, tags,
2745 if (status != REG_OK)
2751 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2759 node->lastpos = cat->right->lastpos;
2774 /* Adds a transition from each position in `p1' to each position in `p2'. */
2775 static reg_errcode_t
2776 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2777 tre_tnfa_transition_t *transitions,
2778 int *counts, int *offs)
2780 tre_pos_and_tags_t *orig_p2 = p2;
2781 tre_tnfa_transition_t *trans;
2782 int i, j, k, l, dup, prev_p2_pos;
2784 if (transitions != NULL)
2785 while (p1->position >= 0)
2789 while (p2->position >= 0)
2791 /* Optimization: if this position was already handled, skip it. */
2792 if (p2->position == prev_p2_pos)
2797 prev_p2_pos = p2->position;
2798 /* Set `trans' to point to the next unused transition from
2799 position `p1->position'. */
2800 trans = transitions + offs[p1->position];
2801 while (trans->state != NULL)
2804 /* If we find a previous transition from `p1->position' to
2805 `p2->position', it is overwritten. This can happen only
2806 if there are nested loops in the regexp, like in "((a)*)*".
2807 In POSIX.2 repetition using the outer loop is always
2808 preferred over using the inner loop. Therefore the
2809 transition for the inner loop is useless and can be thrown
2811 /* XXX - The same position is used for all nodes in a bracket
2812 expression, so this optimization cannot be used (it will
2813 break bracket expressions) unless I figure out a way to
2815 if (trans->state_id == p2->position)
2824 if (trans->state == NULL)
2825 (trans + 1)->state = NULL;
2826 /* Use the character ranges, assertions, etc. from `p1' for
2827 the transition from `p1' to `p2'. */
2828 trans->code_min = p1->code_min;
2829 trans->code_max = p1->code_max;
2830 trans->state = transitions + offs[p2->position];
2831 trans->state_id = p2->position;
2832 trans->assertions = p1->assertions | p2->assertions
2833 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2834 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2835 if (p1->backref >= 0)
2837 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2838 assert(p2->backref < 0);
2839 trans->u.backref = p1->backref;
2840 trans->assertions |= ASSERT_BACKREF;
2843 trans->u.class = p1->class;
2844 if (p1->neg_classes != NULL)
2846 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2847 trans->neg_classes =
2848 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2849 if (trans->neg_classes == NULL)
2851 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2852 trans->neg_classes[i] = p1->neg_classes[i];
2853 trans->neg_classes[i] = (tre_ctype_t)0;
2856 trans->neg_classes = NULL;
2858 /* Find out how many tags this transition has. */
2860 if (p1->tags != NULL)
2861 while(p1->tags[i] >= 0)
2864 if (p2->tags != NULL)
2865 while(p2->tags[j] >= 0)
2868 /* If we are overwriting a transition, free the old tag array. */
2869 if (trans->tags != NULL)
2873 /* If there were any tags, allocate an array and fill it. */
2876 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2880 if (p1->tags != NULL)
2881 while(p1->tags[i] >= 0)
2883 trans->tags[i] = p1->tags[i];
2888 if (p2->tags != NULL)
2889 while (p2->tags[j] >= 0)
2891 /* Don't add duplicates. */
2893 for (k = 0; k < i; k++)
2894 if (trans->tags[k] == p2->tags[j])
2900 trans->tags[l++] = p2->tags[j];
2903 trans->tags[l] = -1;
2911 DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
2913 if (p1->code_max != p1->code_min)
2914 DPRINT(("-%3d", p1->code_max));
2918 DPRINT((", tags ["));
2921 DPRINT(("%d", *tags));
2928 if (trans->assertions)
2929 DPRINT((", assert %d", trans->assertions));
2930 if (trans->assertions & ASSERT_BACKREF)
2931 DPRINT((", backref %d", trans->u.backref));
2932 else if (trans->class)
2933 DPRINT((", class %ld", (long)trans->class));
2934 if (trans->neg_classes)
2935 DPRINT((", neg_classes %p", trans->neg_classes));
2938 #endif /* TRE_DEBUG */
2944 /* Compute a maximum limit for the number of transitions leaving
2946 while (p1->position >= 0)
2949 while (p2->position >= 0)
2951 counts[p1->position]++;
2959 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
2960 labelled with one character range (there are no transitions on empty
2961 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
2963 static reg_errcode_t
2964 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2965 int *counts, int *offs)
2968 tre_catenation_t *cat;
2969 tre_iteration_t *iter;
2970 reg_errcode_t errcode = REG_OK;
2972 /* XXX - recurse using a stack!. */
2978 uni = (tre_union_t *)node->obj;
2979 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2980 if (errcode != REG_OK)
2982 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2986 cat = (tre_catenation_t *)node->obj;
2987 /* Add a transition from each position in cat->left->lastpos
2988 to each position in cat->right->firstpos. */
2989 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2990 transitions, counts, offs);
2991 if (errcode != REG_OK)
2993 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2994 if (errcode != REG_OK)
2996 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3000 iter = (tre_iteration_t *)node->obj;
3001 assert(iter->max == -1 || iter->max == 1);
3003 if (iter->max == -1)
3005 assert(iter->min == 0 || iter->min == 1);
3006 /* Add a transition from each last position in the iterated
3007 expression to each first position. */
3008 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3009 transitions, counts, offs);
3010 if (errcode != REG_OK)
3013 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3021 tre_free(regex_t *preg)
3025 tre_tnfa_transition_t *trans;
3027 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3031 for (i = 0; i < tnfa->num_transitions; i++)
3032 if (tnfa->transitions[i].state)
3034 if (tnfa->transitions[i].tags)
3035 xfree(tnfa->transitions[i].tags);
3036 if (tnfa->transitions[i].neg_classes)
3037 xfree(tnfa->transitions[i].neg_classes);
3039 if (tnfa->transitions)
3040 xfree(tnfa->transitions);
3044 for (trans = tnfa->initial; trans->state; trans++)
3049 xfree(tnfa->initial);
3052 if (tnfa->submatch_data)
3054 for (i = 0; i < tnfa->num_submatches; i++)
3055 if (tnfa->submatch_data[i].parents)
3056 xfree(tnfa->submatch_data[i].parents);
3057 xfree(tnfa->submatch_data);
3060 if (tnfa->tag_directions)
3061 xfree(tnfa->tag_directions);
3066 #define ERROR_EXIT(err) \
3070 if (1) goto error_exit; \
3076 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
3079 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3080 tre_pos_and_tags_t *p;
3081 int *counts = NULL, *offs = NULL;
3083 tre_tnfa_transition_t *transitions, *initial;
3084 tre_tnfa_t *tnfa = NULL;
3085 tre_submatch_data_t *submatch_data;
3086 tre_tag_direction_t *tag_directions = NULL;
3087 reg_errcode_t errcode;
3090 /* Parse context. */
3091 tre_parse_ctx_t parse_ctx;
3093 /* Allocate a stack used throughout the compilation process for various
3095 stack = tre_stack_new(512, 10240, 128);
3098 /* Allocate a fast memory allocator. */
3099 mem = tre_mem_new();
3102 tre_stack_destroy(stack);
3106 /* Parse the regexp. */
3107 memset(&parse_ctx, 0, sizeof(parse_ctx));
3108 parse_ctx.mem = mem;
3109 parse_ctx.stack = stack;
3110 parse_ctx.re = regex;
3112 parse_ctx.cflags = cflags;
3113 parse_ctx.max_backref = -1;
3114 DPRINT(("tre_compile: parsing '%.*" STRF "'\n", n, regex));
3115 errcode = tre_parse(&parse_ctx);
3116 if (errcode != REG_OK)
3117 ERROR_EXIT(errcode);
3118 preg->re_nsub = parse_ctx.submatch_id - 1;
3119 tree = parse_ctx.result;
3122 tre_ast_print(tree);
3123 #endif /* TRE_DEBUG */
3125 /* Referring to nonexistent subexpressions is illegal. */
3126 if (parse_ctx.max_backref > (int)preg->re_nsub)
3127 ERROR_EXIT(REG_ESUBREG);
3129 /* Allocate the TNFA struct. */
3130 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3132 ERROR_EXIT(REG_ESPACE);
3133 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3134 tnfa->num_submatches = parse_ctx.submatch_id;
3136 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3137 regexp does not have back references, this can be skipped. */
3138 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3140 DPRINT(("tre_compile: setting up tags\n"));
3142 /* Figure out how many tags we will need. */
3143 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3144 if (errcode != REG_OK)
3145 ERROR_EXIT(errcode);
3147 tre_ast_print(tree);
3148 #endif /* TRE_DEBUG */
3150 if (tnfa->num_tags > 0)
3152 tag_directions = xmalloc(sizeof(*tag_directions)
3153 * (tnfa->num_tags + 1));
3154 if (tag_directions == NULL)
3155 ERROR_EXIT(REG_ESPACE);
3156 tnfa->tag_directions = tag_directions;
3157 memset(tag_directions, -1,
3158 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3161 submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
3162 if (submatch_data == NULL)
3163 ERROR_EXIT(REG_ESPACE);
3164 tnfa->submatch_data = submatch_data;
3166 errcode = tre_add_tags(mem, stack, tree, tnfa);
3167 if (errcode != REG_OK)
3168 ERROR_EXIT(errcode);
3171 for (i = 0; i < parse_ctx.submatch_id; i++)
3172 DPRINT(("pmatch[%d] = {t%d, t%d}\n",
3173 i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
3174 for (i = 0; i < tnfa->num_tags; i++)
3175 DPRINT(("t%d is %s\n", i,
3176 tag_directions[i] == TRE_TAG_MINIMIZE ?
3177 "minimized" : "maximized"));
3178 #endif /* TRE_DEBUG */
3181 /* Expand iteration nodes. */
3182 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3183 tag_directions, NULL);
3184 if (errcode != REG_OK)
3185 ERROR_EXIT(errcode);
3187 /* Add a dummy node for the final state.
3188 XXX - For certain patterns this dummy node can be optimized away,
3189 for example "a*" or "ab*". Figure out a simple way to detect
3190 this possibility. */
3192 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3193 if (tmp_ast_r == NULL)
3194 ERROR_EXIT(REG_ESPACE);
3196 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3198 ERROR_EXIT(REG_ESPACE);
3201 tre_ast_print(tree);
3202 DPRINT(("Number of states: %d\n", parse_ctx.position));
3203 #endif /* TRE_DEBUG */
3205 errcode = tre_compute_nfl(mem, stack, tree);
3206 if (errcode != REG_OK)
3207 ERROR_EXIT(errcode);
3209 counts = xmalloc(sizeof(int) * parse_ctx.position);
3211 ERROR_EXIT(REG_ESPACE);
3213 offs = xmalloc(sizeof(int) * parse_ctx.position);
3215 ERROR_EXIT(REG_ESPACE);
3217 for (i = 0; i < parse_ctx.position; i++)
3219 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3222 for (i = 0; i < parse_ctx.position; i++)
3225 add += counts[i] + 1;
3228 transitions = xcalloc(add + 1, sizeof(*transitions));
3229 if (transitions == NULL)
3230 ERROR_EXIT(REG_ESPACE);
3231 tnfa->transitions = transitions;
3232 tnfa->num_transitions = add;
3234 DPRINT(("Converting to TNFA:\n"));
3235 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3236 if (errcode != REG_OK)
3237 ERROR_EXIT(errcode);
3241 while (p->position >= 0)
3248 DPRINT(("initial: %d", p->position));
3256 DPRINT(("%d", *tags));
3262 DPRINT((", assert %d", p->assertions));
3265 #endif /* TRE_DEBUG */
3270 initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
3271 if (initial == NULL)
3272 ERROR_EXIT(REG_ESPACE);
3273 tnfa->initial = initial;
3276 for (p = tree->firstpos; p->position >= 0; p++)
3278 initial[i].state = transitions + offs[p->position];
3279 initial[i].state_id = p->position;
3280 initial[i].tags = NULL;
3281 /* Copy the arrays p->tags, they are allocated
3282 from a tre_mem object. */
3286 for (j = 0; p->tags[j] >= 0; j++);
3287 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3288 if (!initial[i].tags)
3289 ERROR_EXIT(REG_ESPACE);
3290 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3292 initial[i].assertions = p->assertions;
3295 initial[i].state = NULL;
3297 tnfa->num_transitions = add;
3298 tnfa->final = transitions + offs[tree->lastpos[0].position];
3299 tnfa->num_states = parse_ctx.position;
3300 tnfa->cflags = cflags;
3302 DPRINT(("final state %p\n", (void *)tnfa->final));
3304 tre_mem_destroy(mem);
3305 tre_stack_destroy(stack);
3309 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3313 /* Free everything that was allocated and return the error code. */
3314 tre_mem_destroy(mem);
3316 tre_stack_destroy(stack);
3321 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3327 /***********************************************************************
3329 ***********************************************************************/
3332 regcomp(regex_t *preg, const char *regex, int cflags)
3336 size_t n = strlen(regex);
3338 if (n+1 > SIZE_MAX/sizeof(tre_char_t))
3340 wregex = xmalloc(sizeof(tre_char_t) * (n + 1));
3344 n = mbstowcs(wregex, regex, n+1);
3345 if (n == (size_t)-1) {
3350 ret = tre_compile(preg, wregex, n, cflags);
3357 regfree(regex_t *preg)