2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 /***********************************************************************
44 ***********************************************************************/
53 tre_ctype_t *neg_classes;
58 /***********************************************************************
59 from tre-ast.c and tre-ast.h
60 ***********************************************************************/
62 /* The different AST node types. */
70 /* Special subtypes of TRE_LITERAL. */
71 #define EMPTY -1 /* Empty leaf (denotes empty string). */
72 #define ASSERTION -2 /* Assertion leaf. */
73 #define TAG -3 /* Tag leaf. */
74 #define BACKREF -4 /* Back reference leaf. */
76 #define IS_SPECIAL(x) ((x)->code_min < 0)
77 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
78 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
79 #define IS_TAG(x) ((x)->code_min == TAG)
80 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
83 /* A generic AST node. All AST nodes consist of this node on the top
84 level with `obj' pointing to the actual content. */
86 tre_ast_type_t type; /* Type of the node. */
87 void *obj; /* Pointer to actual node. */
92 tre_pos_and_tags_t *firstpos;
93 tre_pos_and_tags_t *lastpos;
97 /* A "literal" node. These are created for assertions, back references,
98 tags, matching parameter settings, and all expressions that match one
105 tre_ctype_t *neg_classes;
108 /* A "catenation" node. These are created when two regexps are concatenated.
109 If there are more than one subexpressions in sequence, the `left' part
110 holds all but the last, and `right' part holds the last subexpression
111 (catenation is left associative). */
113 tre_ast_node_t *left;
114 tre_ast_node_t *right;
117 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
120 /* Subexpression to match. */
122 /* Minimum number of consecutive matches. */
124 /* Maximum number of consecutive matches. */
126 /* If 0, match as many characters as possible, if 1 match as few as
127 possible. Note that this does not always mean the same thing as
128 matching as many/few repetitions as possible. */
129 unsigned int minimal:1;
132 /* An "union" node. These are created for the "|" operator. */
134 tre_ast_node_t *left;
135 tre_ast_node_t *right;
138 static tre_ast_node_t *
139 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
141 static tre_ast_node_t *
142 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
144 static tre_ast_node_t *
145 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
148 static tre_ast_node_t *
149 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
151 static tre_ast_node_t *
152 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
153 tre_ast_node_t *right);
156 static tre_ast_node_t *
157 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
159 tre_ast_node_t *node;
161 node = tre_mem_calloc(mem, sizeof(*node));
164 node->obj = tre_mem_calloc(mem, size);
169 node->submatch_id = -1;
174 static tre_ast_node_t *
175 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
177 tre_ast_node_t *node;
180 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
184 lit->code_min = code_min;
185 lit->code_max = code_max;
186 lit->position = position;
191 static tre_ast_node_t *
192 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
195 tre_ast_node_t *node;
196 tre_iteration_t *iter;
198 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
205 iter->minimal = minimal;
206 node->num_submatches = arg->num_submatches;
211 static tre_ast_node_t *
212 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
214 tre_ast_node_t *node;
216 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
219 ((tre_union_t *)node->obj)->left = left;
220 ((tre_union_t *)node->obj)->right = right;
221 node->num_submatches = left->num_submatches + right->num_submatches;
226 static tre_ast_node_t *
227 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
228 tre_ast_node_t *right)
230 tre_ast_node_t *node;
232 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
235 ((tre_catenation_t *)node->obj)->left = left;
236 ((tre_catenation_t *)node->obj)->right = right;
237 node->num_submatches = left->num_submatches + right->num_submatches;
243 /***********************************************************************
244 from tre-stack.c and tre-stack.h
245 ***********************************************************************/
247 typedef struct tre_stack_rec tre_stack_t;
249 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
250 is maximum size, and `increment' specifies how much more space will be
251 allocated with realloc() if all space gets used up. Returns the stack
252 object or NULL if out of memory. */
254 tre_stack_new(int size, int max_size, int increment);
256 /* Frees the stack object. */
258 tre_stack_destroy(tre_stack_t *s);
260 /* Returns the current number of objects in the stack. */
262 tre_stack_num_objects(tre_stack_t *s);
264 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
265 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
266 This tries to realloc() more space before failing if maximum size
267 has not yet been reached. Returns REG_OK if successful. */
268 #define declare_pushf(typetag, type) \
269 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
271 declare_pushf(voidptr, void *);
272 declare_pushf(int, int);
274 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
275 element off of stack `s' and returns it. The stack must not be
277 #define declare_popf(typetag, type) \
278 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
280 declare_popf(voidptr, void *);
281 declare_popf(int, int);
283 /* Just to save some typing. */
284 #define STACK_PUSH(s, typetag, value) \
287 status = tre_stack_push_ ## typetag(s, value); \
289 while (/*CONSTCOND*/0)
291 #define STACK_PUSHX(s, typetag, value) \
293 status = tre_stack_push_ ## typetag(s, value); \
294 if (status != REG_OK) \
298 #define STACK_PUSHR(s, typetag, value) \
300 reg_errcode_t _status; \
301 _status = tre_stack_push_ ## typetag(s, value); \
302 if (_status != REG_OK) \
306 union tre_stack_item {
311 struct tre_stack_rec {
316 union tre_stack_item *stack;
321 tre_stack_new(int size, int max_size, int increment)
325 s = xmalloc(sizeof(*s));
328 s->stack = xmalloc(sizeof(*s->stack) * size);
329 if (s->stack == NULL)
335 s->max_size = max_size;
336 s->increment = increment;
343 tre_stack_destroy(tre_stack_t *s)
350 tre_stack_num_objects(tre_stack_t *s)
356 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
358 if (s->ptr < s->size)
360 s->stack[s->ptr] = value;
365 if (s->size >= s->max_size)
371 union tre_stack_item *new_buffer;
373 new_size = s->size + s->increment;
374 if (new_size > s->max_size)
375 new_size = s->max_size;
376 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
377 if (new_buffer == NULL)
381 assert(new_size > s->size);
383 s->stack = new_buffer;
384 tre_stack_push(s, value);
390 #define define_pushf(typetag, type) \
391 declare_pushf(typetag, type) { \
392 union tre_stack_item item; \
393 item.typetag ## _value = value; \
394 return tre_stack_push(s, item); \
397 define_pushf(int, int)
398 define_pushf(voidptr, void *)
400 #define define_popf(typetag, type) \
401 declare_popf(typetag, type) { \
402 return s->stack[--s->ptr].typetag ## _value; \
405 define_popf(int, int)
406 define_popf(voidptr, void *)
409 /***********************************************************************
410 from tre-parse.c and tre-parse.h
411 ***********************************************************************/
415 /* Memory allocator. The AST is allocated using this. */
417 /* Stack used for keeping track of regexp syntax. */
419 /* The parse result. */
420 tre_ast_node_t *result;
421 /* The regexp to parse and its length. */
423 /* The first character of the entire regexp. */
424 const char *re_start;
425 /* Current submatch ID. */
427 /* Current position (number of literal). */
429 /* The highest back reference or -1 if none seen so far. */
431 /* This flag is set if the regexp uses approximate matching. */
433 /* Compilation flags. */
435 /* If this flag is set the top-level submatch is not captured. */
439 /* Parses a wide character regexp pattern into a syntax tree. This parser
440 handles both syntaxes (BRE and ERE), including the TRE extensions. */
442 tre_parse(tre_parse_ctx_t *ctx);
446 This parser is just a simple recursive descent parser for POSIX.2
447 regexps. The parser supports both the obsolete default syntax and
448 the "extended" syntax, and some nonstandard extensions.
451 /* Characters with special meanings in regexp syntax. */
452 #define CHAR_PIPE '|'
453 #define CHAR_LPAREN '('
454 #define CHAR_RPAREN ')'
455 #define CHAR_LBRACE '{'
456 #define CHAR_RBRACE '}'
457 #define CHAR_LBRACKET '['
458 #define CHAR_RBRACKET ']'
459 #define CHAR_MINUS '-'
460 #define CHAR_STAR '*'
461 #define CHAR_QUESTIONMARK '?'
462 #define CHAR_PLUS '+'
463 #define CHAR_PERIOD '.'
464 #define CHAR_COLON ':'
465 #define CHAR_EQUAL '='
466 #define CHAR_COMMA ','
467 #define CHAR_CARET '^'
468 #define CHAR_DOLLAR '$'
469 #define CHAR_BACKSLASH '\\'
470 #define CHAR_HASH '#'
471 #define CHAR_TILDE '~'
474 /* Some macros for expanding \w, \s, etc. */
475 static const struct tre_macro_struct {
477 const char *expansion;
479 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
480 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
481 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
482 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
487 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
488 must have at least `len' items. Sets buf[0] to zero if the there
489 is no match in `tre_macros'. */
491 tre_expand_macro(const char *regex)
498 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
499 return tre_macros[i].expansion;
503 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
504 tre_ast_node_t ***items)
506 reg_errcode_t status;
507 tre_ast_node_t **array = *items;
508 /* Allocate more space if necessary. */
511 tre_ast_node_t **new_items;
512 /* If the array is already 1024 items large, give up -- there's
513 probably an error in the regexp (e.g. not a '\0' terminated
514 string and missing ']') */
518 new_items = xrealloc(array, sizeof(*array) * *max_i);
519 if (new_items == NULL)
521 *items = array = new_items;
523 array[*i] = tre_ast_new_literal(mem, min, max, -1);
524 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
531 tre_compare_items(const void *a, const void *b)
533 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
534 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
535 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
536 int a_min = l_a->code_min, b_min = l_b->code_min;
540 else if (a_min > b_min)
546 /* Maximum number of character classes that can occur in a negated bracket
548 #define MAX_NEG_CLASSES 64
550 /* Maximum length of character class names. */
551 #define MAX_CLASS_NAME
554 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
555 tre_ctype_t neg_classes[], int *num_neg_classes,
556 tre_ast_node_t ***items, int *num_items,
559 const char *re = ctx->re;
560 reg_errcode_t status = REG_OK;
561 tre_ctype_t class = (tre_ctype_t)0;
563 int max_i = *items_size;
566 /* Build an array of the items in the bracket expression. */
567 while (status == REG_OK)
574 else if (*re == CHAR_RBRACKET && re > ctx->re)
581 tre_cint_t min = 0, max = 0;
583 int clen = mbtowc(&wc, re, -1);
585 if (clen<0) clen=1, wc=WEOF;
587 class = (tre_ctype_t)0;
588 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
592 clen = mbtowc(&wc, re, -1);
593 if (clen<0) clen=1, wc=WEOF;
596 /* XXX - Should use collation order instead of encoding values
597 in character ranges. */
601 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
602 status = REG_ECOLLATE;
603 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
604 status = REG_ECOLLATE;
605 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
608 const char *endptr = re + 2;
610 while (*endptr && *endptr != CHAR_COLON)
614 len = MIN(endptr - re - 2, 63);
615 strncpy(tmp_str, re + 2, len);
617 class = tre_ctype(tmp_str);
629 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
631 /* Two ranges are not allowed to share and endpoint. */
637 if (status != REG_OK)
641 if (*num_neg_classes >= MAX_NEG_CLASSES)
644 neg_classes[(*num_neg_classes)++] = class;
647 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
648 if (status != REG_OK)
650 ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
653 /* Add opposite-case counterpoints if REG_ICASE is present.
654 This is broken if there are more than two "same" characters. */
655 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
657 tre_cint_t cmin, ccurr;
661 if (tre_islower(min))
663 cmin = ccurr = tre_toupper(min++);
664 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
666 ccurr = tre_toupper(min++);
667 status = tre_new_item(ctx->mem, cmin, ccurr,
670 else if (tre_isupper(min))
672 cmin = ccurr = tre_tolower(min++);
673 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
675 ccurr = tre_tolower(min++);
676 status = tre_new_item(ctx->mem, cmin, ccurr,
680 if (status != REG_OK)
683 if (status != REG_OK)
695 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
697 tre_ast_node_t *node = NULL;
699 reg_errcode_t status = REG_OK;
700 tre_ast_node_t **items, *u, *n;
701 int i = 0, j, max_i = 32, curr_max, curr_min;
702 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
703 int num_neg_classes = 0;
705 /* Start off with an array of `max_i' elements. */
706 items = xmalloc(sizeof(*items) * max_i);
710 if (*ctx->re == CHAR_CARET)
716 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
719 if (status != REG_OK)
720 goto parse_bracket_done;
722 /* Sort the array if we need to negate it. */
724 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
726 curr_max = curr_min = 0;
727 /* Build a union of the items in the array, negated if necessary. */
728 for (j = 0; j < i && status == REG_OK; j++)
731 tre_literal_t *l = items[j]->obj;
740 curr_max = MAX(max + 1, curr_max);
747 if (curr_max >= curr_min)
749 l->code_min = curr_min;
750 l->code_max = curr_max;
756 curr_min = curr_max = max + 1;
763 l->position = ctx->position;
764 if (num_neg_classes > 0)
766 l->neg_classes = tre_mem_alloc(ctx->mem,
767 (sizeof(*l->neg_classes)
768 * (num_neg_classes + 1)));
769 if (l->neg_classes == NULL)
774 for (k = 0; k < num_neg_classes; k++)
775 l->neg_classes[k] = neg_classes[k];
776 l->neg_classes[k] = (tre_ctype_t)0;
779 l->neg_classes = NULL;
784 u = tre_ast_new_union(ctx->mem, node, items[j]);
792 if (status != REG_OK)
793 goto parse_bracket_done;
798 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
803 tre_literal_t *l = n->obj;
804 if (num_neg_classes > 0)
806 l->neg_classes = tre_mem_alloc(ctx->mem,
807 (sizeof(*l->neg_classes)
808 * (num_neg_classes + 1)));
809 if (l->neg_classes == NULL)
812 goto parse_bracket_done;
814 for (k = 0; k < num_neg_classes; k++)
815 l->neg_classes[k] = neg_classes[k];
816 l->neg_classes[k] = (tre_ctype_t)0;
819 l->neg_classes = NULL;
824 u = tre_ast_new_union(ctx->mem, node, n);
832 if (status != REG_OK)
833 goto parse_bracket_done;
837 #endif /* TRE_DEBUG */
847 /* Parses a positive decimal integer. Returns -1 if the string does not
848 contain a valid number. */
850 tre_parse_int(const char **regex)
853 const char *r = *regex;
858 num = num * 10 + *r - '0';
867 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
870 const char *r = ctx->re;
873 /* Parse number (minimum repetition count). */
875 if (*r >= '0' && *r <= '9') {
876 min = tre_parse_int(&r);
879 /* Parse comma and second number (maximum repetition count). */
881 if (*r == CHAR_COMMA)
884 max = tre_parse_int(&r);
887 /* Check that the repeat counts are sane. */
888 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
895 /* Empty contents of {}. */
899 /* Parse the ending '}' or '\}'.*/
900 if (ctx->cflags & REG_EXTENDED)
902 if (*r != CHAR_RBRACE)
908 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
913 /* Create the AST node(s). */
914 if (min == 0 && max == 0)
916 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
922 if (min < 0 && max < 0)
923 /* Only approximate parameters set, no repetitions. */
926 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
938 PARSE_MARK_FOR_SUBMATCH,
942 PARSE_POST_CATENATION,
947 } tre_parse_re_stack_symbol_t;
951 tre_parse(tre_parse_ctx_t *ctx)
953 tre_ast_node_t *result = NULL;
954 tre_parse_re_stack_symbol_t symbol;
955 reg_errcode_t status = REG_OK;
956 tre_stack_t *stack = ctx->stack;
957 int bottom = tre_stack_num_objects(stack);
962 if (!ctx->nofirstsub)
964 STACK_PUSH(stack, int, ctx->submatch_id);
965 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
968 STACK_PUSH(stack, int, PARSE_RE);
969 ctx->re_start = ctx->re;
972 /* The following is basically just a recursive descent parser. I use
973 an explicit stack instead of recursive functions mostly because of
974 two reasons: compatibility with systems which have an overflowable
975 call stack, and efficiency (both in lines of code and speed). */
976 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
978 if (status != REG_OK)
980 symbol = tre_stack_pop_int(stack);
984 /* Parse a full regexp. A regexp is one or more branches,
985 separated by the union operator `|'. */
986 if (ctx->cflags & REG_EXTENDED)
987 STACK_PUSHX(stack, int, PARSE_UNION);
988 STACK_PUSHX(stack, int, PARSE_BRANCH);
992 /* Parse a branch. A branch is one or more pieces, concatenated.
993 A piece is an atom possibly followed by a postfix operator. */
994 STACK_PUSHX(stack, int, PARSE_CATENATION);
995 STACK_PUSHX(stack, int, PARSE_PIECE);
999 /* Parse a piece. A piece is an atom possibly followed by one
1000 or more postfix operators. */
1001 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1002 STACK_PUSHX(stack, int, PARSE_ATOM);
1005 case PARSE_CATENATION:
1006 /* If the expression has not ended, parse another piece. */
1012 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1014 if ((ctx->cflags & REG_EXTENDED
1015 && c == CHAR_RPAREN && depth > 0)
1016 || (!(ctx->cflags & REG_EXTENDED)
1017 && (c == CHAR_BACKSLASH
1018 && *(ctx->re + 1) == CHAR_RPAREN)))
1020 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1021 status = REG_EPAREN;
1023 if (!(ctx->cflags & REG_EXTENDED))
1029 /* Default case, left associative concatenation. */
1030 STACK_PUSHX(stack, int, PARSE_CATENATION);
1031 STACK_PUSHX(stack, voidptr, result);
1032 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1033 STACK_PUSHX(stack, int, PARSE_PIECE);
1038 case PARSE_POST_CATENATION:
1040 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1041 tre_ast_node_t *tmp_node;
1042 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1053 STACK_PUSHX(stack, int, PARSE_UNION);
1054 STACK_PUSHX(stack, voidptr, result);
1055 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1056 STACK_PUSHX(stack, int, PARSE_BRANCH);
1069 case PARSE_POST_UNION:
1071 tre_ast_node_t *tmp_node;
1072 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1073 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1081 /* Parse postfix operators. */
1085 case CHAR_QUESTIONMARK:
1086 if (!(ctx->cflags & REG_EXTENDED))
1091 tre_ast_node_t *tmp_node;
1096 if (*ctx->re == CHAR_PLUS)
1098 if (*ctx->re == CHAR_QUESTIONMARK)
1102 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1104 if (tmp_node == NULL)
1107 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1111 case CHAR_BACKSLASH:
1112 /* "\{" is special without REG_EXTENDED */
1113 if (!(ctx->cflags & REG_EXTENDED)
1114 && *(ctx->re + 1) == CHAR_LBRACE)
1123 /* "{" is literal without REG_EXTENDED */
1124 if (!(ctx->cflags & REG_EXTENDED))
1130 status = tre_parse_bound(ctx, &result);
1131 if (status != REG_OK)
1133 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1139 /* Parse an atom. An atom is a regular expression enclosed in `()',
1140 an empty set of `()', a bracket expression, `.', `^', `$',
1141 a `\' followed by a character, or a single character. */
1145 case CHAR_LPAREN: /* parenthesized subexpression */
1147 if (ctx->cflags & REG_EXTENDED)
1153 /* First parse a whole RE, then mark the resulting tree
1155 STACK_PUSHX(stack, int, ctx->submatch_id);
1156 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1157 STACK_PUSHX(stack, int, PARSE_RE);
1165 case CHAR_LBRACKET: /* bracket expression */
1167 status = tre_parse_bracket(ctx, &result);
1168 if (status != REG_OK)
1172 case CHAR_BACKSLASH:
1173 /* If this is "\(" or "\)" chew off the backslash and
1175 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1180 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
1185 /* If a macro is used, parse the expanded macro recursively. */
1187 const char *buf = tre_expand_macro(ctx->re + 1);
1190 tre_parse_ctx_t subctx;
1191 memcpy(&subctx, ctx, sizeof(subctx));
1193 subctx.nofirstsub = 1;
1194 status = tre_parse(&subctx);
1195 if (status != REG_OK)
1198 ctx->position = subctx.position;
1199 result = subctx.result;
1205 /* Trailing backslash. */
1212 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1217 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1218 ASSERT_AT_WB_NEG, -1);
1222 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1227 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1233 if (ctx->re[0] != CHAR_LBRACE)
1235 /* 8 bit hex char. */
1236 char tmp[3] = {0, 0, 0};
1239 if (tre_isxdigit(ctx->re[0]))
1241 tmp[0] = (char)ctx->re[0];
1244 if (tre_isxdigit(ctx->re[0]))
1246 tmp[1] = (char)ctx->re[0];
1249 val = strtol(tmp, NULL, 16);
1250 result = tre_ast_new_literal(ctx->mem, (int)val,
1251 (int)val, ctx->position);
1262 while (*ctx->re && i < sizeof tmp)
1264 if (ctx->re[0] == CHAR_RBRACE)
1266 if (tre_isxdigit(ctx->re[0]))
1268 tmp[i] = (char)ctx->re[0];
1277 val = strtol(tmp, NULL, 16);
1278 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1286 if (tre_isdigit(*ctx->re))
1288 /* Back reference. */
1289 int val = *ctx->re - '0';
1290 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1295 ctx->max_backref = MAX(val, ctx->max_backref);
1300 /* Escaped character. */
1301 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1312 case CHAR_PERIOD: /* the any-symbol */
1313 if (ctx->cflags & REG_NEWLINE)
1315 tre_ast_node_t *tmp1;
1316 tre_ast_node_t *tmp2;
1317 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1321 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1325 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1332 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1341 case CHAR_CARET: /* beginning of line assertion */
1342 /* '^' has a special meaning everywhere in EREs, and at
1343 beginning of BRE. */
1344 if (ctx->cflags & REG_EXTENDED
1345 || ctx->re == ctx->re_start)
1347 if (!(ctx->cflags & REG_EXTENDED))
1348 STACK_PUSHX(stack, int, PARSE_CATENATION);
1349 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1359 case CHAR_DOLLAR: /* end of line assertion. */
1360 /* '$' is special everywhere in EREs, and in the end of the
1362 if (ctx->cflags & REG_EXTENDED
1365 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1382 case CHAR_QUESTIONMARK:
1383 if (!(ctx->cflags & REG_EXTENDED))
1388 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1396 clen = mbtowc(&wc, ctx->re, -1);
1397 if (clen<0) clen=1, wc=WEOF;
1399 /* Note that we can't use an tre_isalpha() test here, since there
1400 may be characters which are alphabetic but neither upper or
1402 if (ctx->cflags & REG_ICASE
1403 && (tre_isupper(wc) || tre_islower(wc)))
1405 tre_ast_node_t *tmp1;
1406 tre_ast_node_t *tmp2;
1408 /* XXX - Can there be more than one opposite-case
1409 counterpoints for some character in some locale? Or
1410 more than two characters which all should be regarded
1411 the same character if case is ignored? If yes, there
1412 does not seem to be a portable way to detect it. I guess
1413 that at least for multi-character collating elements there
1414 could be several opposite-case counterpoints, but they
1415 cannot be supported portably anyway. */
1416 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1421 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1426 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1432 result = tre_ast_new_literal(ctx->mem, wc, wc,
1443 case PARSE_MARK_FOR_SUBMATCH:
1445 int submatch_id = tre_stack_pop_int(stack);
1447 if (result->submatch_id >= 0)
1449 tre_ast_node_t *n, *tmp_node;
1450 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1453 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1454 if (tmp_node == NULL)
1456 tmp_node->num_submatches = result->num_submatches;
1459 result->submatch_id = submatch_id;
1460 result->num_submatches++;
1464 case PARSE_RESTORE_CFLAGS:
1465 ctx->cflags = tre_stack_pop_int(stack);
1474 /* Check for missing closing parentheses. */
1478 if (status == REG_OK)
1479 ctx->result = result;
1486 /***********************************************************************
1488 ***********************************************************************/
1493 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1498 Algorithms to setup tags so that submatch addressing can be done.
1502 /* Inserts a catenation node to the root of the tree given in `node'.
1503 As the left child a new tag with number `tag_id' to `node' is added,
1504 and the right child is the old root. */
1505 static reg_errcode_t
1506 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1508 tre_catenation_t *c;
1510 c = tre_mem_alloc(mem, sizeof(*c));
1513 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1514 if (c->left == NULL)
1516 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1517 if (c->right == NULL)
1520 c->right->obj = node->obj;
1521 c->right->type = node->type;
1522 c->right->nullable = -1;
1523 c->right->submatch_id = -1;
1524 c->right->firstpos = NULL;
1525 c->right->lastpos = NULL;
1526 c->right->num_tags = 0;
1528 node->type = CATENATION;
1532 /* Inserts a catenation node to the root of the tree given in `node'.
1533 As the right child a new tag with number `tag_id' to `node' is added,
1534 and the left child is the old root. */
1535 static reg_errcode_t
1536 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1538 tre_catenation_t *c;
1540 c = tre_mem_alloc(mem, sizeof(*c));
1543 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1544 if (c->right == NULL)
1546 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1547 if (c->left == NULL)
1550 c->left->obj = node->obj;
1551 c->left->type = node->type;
1552 c->left->nullable = -1;
1553 c->left->submatch_id = -1;
1554 c->left->firstpos = NULL;
1555 c->left->lastpos = NULL;
1556 c->left->num_tags = 0;
1558 node->type = CATENATION;
1564 ADDTAGS_AFTER_ITERATION,
1565 ADDTAGS_AFTER_UNION_LEFT,
1566 ADDTAGS_AFTER_UNION_RIGHT,
1567 ADDTAGS_AFTER_CAT_LEFT,
1568 ADDTAGS_AFTER_CAT_RIGHT,
1569 ADDTAGS_SET_SUBMATCH_END
1570 } tre_addtags_symbol_t;
1579 /* Go through `regset' and set submatch data for submatches that are
1582 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1586 for (i = 0; regset[i] >= 0; i++)
1588 int id = regset[i] / 2;
1589 int start = !(regset[i] % 2);
1591 tnfa->submatch_data[id].so_tag = tag;
1593 tnfa->submatch_data[id].eo_tag = tag;
1599 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1600 subexpressions marked for submatch addressing can be traced. */
1601 static reg_errcode_t
1602 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1605 reg_errcode_t status = REG_OK;
1606 tre_addtags_symbol_t symbol;
1607 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1608 int bottom = tre_stack_num_objects(stack);
1609 /* True for first pass (counting number of needed tags) */
1610 int first_pass = (mem == NULL || tnfa == NULL);
1611 int *regset, *orig_regset;
1612 int num_tags = 0; /* Total number of tags. */
1613 int num_minimals = 0; /* Number of special minimal tags. */
1614 int tag = 0; /* The tag that is to be added next. */
1615 int next_tag = 1; /* Next tag to use after this one. */
1616 int *parents; /* Stack of submatches the current submatch is
1618 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1619 tre_tag_states_t *saved_states;
1621 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1625 tnfa->minimal_tags[0] = -1;
1628 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1632 orig_regset = regset;
1634 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1635 if (parents == NULL)
1642 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1643 if (saved_states == NULL)
1652 for (i = 0; i <= tnfa->num_submatches; i++)
1653 saved_states[i].tag = -1;
1656 STACK_PUSH(stack, voidptr, node);
1657 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1659 while (tre_stack_num_objects(stack) > bottom)
1661 if (status != REG_OK)
1664 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1668 case ADDTAGS_SET_SUBMATCH_END:
1670 int id = tre_stack_pop_int(stack);
1673 /* Add end of this submatch to regset. */
1674 for (i = 0; regset[i] >= 0; i++);
1675 regset[i] = id * 2 + 1;
1678 /* Pop this submatch from the parents stack. */
1679 for (i = 0; parents[i] >= 0; i++);
1680 parents[i - 1] = -1;
1684 case ADDTAGS_RECURSE:
1685 node = tre_stack_pop_voidptr(stack);
1687 if (node->submatch_id >= 0)
1689 int id = node->submatch_id;
1693 /* Add start of this submatch to regset. */
1694 for (i = 0; regset[i] >= 0; i++);
1700 for (i = 0; parents[i] >= 0; i++);
1701 tnfa->submatch_data[id].parents = NULL;
1704 int *p = xmalloc(sizeof(*p) * (i + 1));
1707 status = REG_ESPACE;
1710 assert(tnfa->submatch_data[id].parents == NULL);
1711 tnfa->submatch_data[id].parents = p;
1712 for (i = 0; parents[i] >= 0; i++)
1718 /* Add end of this submatch to regset after processing this
1720 STACK_PUSHX(stack, int, node->submatch_id);
1721 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1728 tre_literal_t *lit = node->obj;
1730 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1735 /* Regset is not empty, so add a tag before the
1736 literal or backref. */
1739 status = tre_add_tag_left(mem, node, tag);
1740 tnfa->tag_directions[tag] = direction;
1741 if (minimal_tag >= 0)
1743 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1744 tnfa->minimal_tags[i] = tag;
1745 tnfa->minimal_tags[i + 1] = minimal_tag;
1746 tnfa->minimal_tags[i + 2] = -1;
1750 tre_purge_regset(regset, tnfa, tag);
1765 assert(!IS_TAG(lit));
1771 tre_catenation_t *cat = node->obj;
1772 tre_ast_node_t *left = cat->left;
1773 tre_ast_node_t *right = cat->right;
1774 int reserved_tag = -1;
1777 /* After processing right child. */
1778 STACK_PUSHX(stack, voidptr, node);
1779 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1781 /* Process right child. */
1782 STACK_PUSHX(stack, voidptr, right);
1783 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1785 /* After processing left child. */
1786 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1787 if (left->num_tags > 0 && right->num_tags > 0)
1789 /* Reserve the next tag to the right child. */
1790 reserved_tag = next_tag;
1793 STACK_PUSHX(stack, int, reserved_tag);
1794 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1796 /* Process left child. */
1797 STACK_PUSHX(stack, voidptr, left);
1798 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1804 tre_iteration_t *iter = node->obj;
1808 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1812 STACK_PUSHX(stack, int, tag);
1813 STACK_PUSHX(stack, int, iter->minimal);
1815 STACK_PUSHX(stack, voidptr, node);
1816 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1818 STACK_PUSHX(stack, voidptr, iter->arg);
1819 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1821 /* Regset is not empty, so add a tag here. */
1822 if (regset[0] >= 0 || iter->minimal)
1827 status = tre_add_tag_left(mem, node, tag);
1829 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1831 tnfa->tag_directions[tag] = direction;
1832 if (minimal_tag >= 0)
1834 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1835 tnfa->minimal_tags[i] = tag;
1836 tnfa->minimal_tags[i + 1] = minimal_tag;
1837 tnfa->minimal_tags[i + 2] = -1;
1841 tre_purge_regset(regset, tnfa, tag);
1849 direction = TRE_TAG_MINIMIZE;
1854 tre_union_t *uni = node->obj;
1855 tre_ast_node_t *left = uni->left;
1856 tre_ast_node_t *right = uni->right;
1862 left_tag = next_tag;
1863 right_tag = next_tag + 1;
1868 right_tag = next_tag;
1871 /* After processing right child. */
1872 STACK_PUSHX(stack, int, right_tag);
1873 STACK_PUSHX(stack, int, left_tag);
1874 STACK_PUSHX(stack, voidptr, regset);
1875 STACK_PUSHX(stack, int, regset[0] >= 0);
1876 STACK_PUSHX(stack, voidptr, node);
1877 STACK_PUSHX(stack, voidptr, right);
1878 STACK_PUSHX(stack, voidptr, left);
1879 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1881 /* Process right child. */
1882 STACK_PUSHX(stack, voidptr, right);
1883 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1885 /* After processing left child. */
1886 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1888 /* Process left child. */
1889 STACK_PUSHX(stack, voidptr, left);
1890 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1892 /* Regset is not empty, so add a tag here. */
1898 status = tre_add_tag_left(mem, node, tag);
1899 tnfa->tag_directions[tag] = direction;
1900 if (minimal_tag >= 0)
1902 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1903 tnfa->minimal_tags[i] = tag;
1904 tnfa->minimal_tags[i + 1] = minimal_tag;
1905 tnfa->minimal_tags[i + 2] = -1;
1909 tre_purge_regset(regset, tnfa, tag);
1918 if (node->num_submatches > 0)
1920 /* The next two tags are reserved for markers. */
1930 if (node->submatch_id >= 0)
1933 /* Push this submatch on the parents stack. */
1934 for (i = 0; parents[i] >= 0; i++);
1935 parents[i] = node->submatch_id;
1936 parents[i + 1] = -1;
1939 break; /* end case: ADDTAGS_RECURSE */
1941 case ADDTAGS_AFTER_ITERATION:
1945 node = tre_stack_pop_voidptr(stack);
1948 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1949 + tre_stack_pop_int(stack);
1954 minimal = tre_stack_pop_int(stack);
1955 enter_tag = tre_stack_pop_int(stack);
1957 minimal_tag = enter_tag;
1963 direction = TRE_TAG_MINIMIZE;
1965 direction = TRE_TAG_MAXIMIZE;
1970 case ADDTAGS_AFTER_CAT_LEFT:
1972 int new_tag = tre_stack_pop_int(stack);
1973 next_tag = tre_stack_pop_int(stack);
1981 case ADDTAGS_AFTER_CAT_RIGHT:
1982 node = tre_stack_pop_voidptr(stack);
1984 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1985 + ((tre_catenation_t *)node->obj)->right->num_tags;
1988 case ADDTAGS_AFTER_UNION_LEFT:
1989 /* Lift the bottom of the `regset' array so that when processing
1990 the right operand the items currently in the array are
1991 invisible. The original bottom was saved at ADDTAGS_UNION and
1992 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1993 while (*regset >= 0)
1997 case ADDTAGS_AFTER_UNION_RIGHT:
1999 int added_tags, tag_left, tag_right;
2000 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2001 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2002 node = tre_stack_pop_voidptr(stack);
2003 added_tags = tre_stack_pop_int(stack);
2006 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2007 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2008 + ((node->num_submatches > 0) ? 2 : 0);
2010 regset = tre_stack_pop_voidptr(stack);
2011 tag_left = tre_stack_pop_int(stack);
2012 tag_right = tre_stack_pop_int(stack);
2014 /* Add tags after both children, the left child gets a smaller
2015 tag than the right child. This guarantees that we prefer
2016 the left child over the right child. */
2017 /* XXX - This is not always necessary (if the children have
2018 tags which must be seen for every match of that child). */
2019 /* XXX - Check if this is the only place where tre_add_tag_right
2020 is used. If so, use tre_add_tag_left (putting the tag before
2021 the child as opposed after the child) and throw away
2022 tre_add_tag_right. */
2023 if (node->num_submatches > 0)
2027 status = tre_add_tag_right(mem, left, tag_left);
2028 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2029 status = tre_add_tag_right(mem, right, tag_right);
2030 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2034 direction = TRE_TAG_MAXIMIZE;
2042 } /* end switch(symbol) */
2043 } /* end while(tre_stack_num_objects(stack) > bottom) */
2046 tre_purge_regset(regset, tnfa, tag);
2048 if (!first_pass && minimal_tag >= 0)
2051 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2052 tnfa->minimal_tags[i] = tag;
2053 tnfa->minimal_tags[i + 1] = minimal_tag;
2054 tnfa->minimal_tags[i + 2] = -1;
2059 assert(tree->num_tags == num_tags);
2060 tnfa->end_tag = num_tags;
2061 tnfa->num_tags = num_tags;
2062 tnfa->num_minimals = num_minimals;
2065 xfree(saved_states);
2072 AST to TNFA compilation routines.
2078 } tre_copyast_symbol_t;
2080 /* Flags for tre_copy_ast(). */
2081 #define COPY_REMOVE_TAGS 1
2082 #define COPY_MAXIMIZE_FIRST_TAG 2
2084 static reg_errcode_t
2085 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2086 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2087 tre_ast_node_t **copy, int *max_pos)
2089 reg_errcode_t status = REG_OK;
2090 int bottom = tre_stack_num_objects(stack);
2093 tre_ast_node_t **result = copy;
2094 tre_copyast_symbol_t symbol;
2096 STACK_PUSH(stack, voidptr, ast);
2097 STACK_PUSH(stack, int, COPY_RECURSE);
2099 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2101 tre_ast_node_t *node;
2102 if (status != REG_OK)
2105 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2108 case COPY_SET_RESULT_PTR:
2109 result = tre_stack_pop_voidptr(stack);
2112 node = tre_stack_pop_voidptr(stack);
2117 tre_literal_t *lit = node->obj;
2118 int pos = lit->position;
2119 int min = lit->code_min;
2120 int max = lit->code_max;
2121 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2123 /* XXX - e.g. [ab] has only one position but two
2124 nodes, so we are creating holes in the state space
2125 here. Not fatal, just wastes memory. */
2129 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2131 /* Change this tag to empty. */
2135 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2138 /* Maximize the first tag. */
2139 tag_directions[max] = TRE_TAG_MAXIMIZE;
2142 *result = tre_ast_new_literal(mem, min, max, pos);
2143 if (*result == NULL)
2144 status = REG_ESPACE;
2152 tre_union_t *uni = node->obj;
2154 *result = tre_ast_new_union(mem, uni->left, uni->right);
2155 if (*result == NULL)
2157 status = REG_ESPACE;
2160 tmp = (*result)->obj;
2161 result = &tmp->left;
2162 STACK_PUSHX(stack, voidptr, uni->right);
2163 STACK_PUSHX(stack, int, COPY_RECURSE);
2164 STACK_PUSHX(stack, voidptr, &tmp->right);
2165 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2166 STACK_PUSHX(stack, voidptr, uni->left);
2167 STACK_PUSHX(stack, int, COPY_RECURSE);
2172 tre_catenation_t *cat = node->obj;
2173 tre_catenation_t *tmp;
2174 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2175 if (*result == NULL)
2177 status = REG_ESPACE;
2180 tmp = (*result)->obj;
2183 result = &tmp->left;
2185 STACK_PUSHX(stack, voidptr, cat->right);
2186 STACK_PUSHX(stack, int, COPY_RECURSE);
2187 STACK_PUSHX(stack, voidptr, &tmp->right);
2188 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2189 STACK_PUSHX(stack, voidptr, cat->left);
2190 STACK_PUSHX(stack, int, COPY_RECURSE);
2195 tre_iteration_t *iter = node->obj;
2196 STACK_PUSHX(stack, voidptr, iter->arg);
2197 STACK_PUSHX(stack, int, COPY_RECURSE);
2198 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2199 iter->max, iter->minimal);
2200 if (*result == NULL)
2202 status = REG_ESPACE;
2205 iter = (*result)->obj;
2206 result = &iter->arg;
2216 *pos_add += num_copied;
2223 } tre_expand_ast_symbol_t;
2225 /* Expands each iteration node that has a finite nonzero minimum or maximum
2226 iteration count to a catenated sequence of copies of the node. */
2227 static reg_errcode_t
2228 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2229 int *position, tre_tag_direction_t *tag_directions)
2231 reg_errcode_t status = REG_OK;
2232 int bottom = tre_stack_num_objects(stack);
2234 int pos_add_total = 0;
2238 STACK_PUSHR(stack, voidptr, ast);
2239 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2240 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2242 tre_ast_node_t *node;
2243 tre_expand_ast_symbol_t symbol;
2245 if (status != REG_OK)
2248 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2249 node = tre_stack_pop_voidptr(stack);
2252 case EXPAND_RECURSE:
2257 tre_literal_t *lit= node->obj;
2258 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2260 lit->position += pos_add;
2261 if (lit->position > max_pos)
2262 max_pos = lit->position;
2268 tre_union_t *uni = node->obj;
2269 STACK_PUSHX(stack, voidptr, uni->right);
2270 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2271 STACK_PUSHX(stack, voidptr, uni->left);
2272 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2277 tre_catenation_t *cat = node->obj;
2278 STACK_PUSHX(stack, voidptr, cat->right);
2279 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2280 STACK_PUSHX(stack, voidptr, cat->left);
2281 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2286 tre_iteration_t *iter = node->obj;
2287 STACK_PUSHX(stack, int, pos_add);
2288 STACK_PUSHX(stack, voidptr, node);
2289 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2290 STACK_PUSHX(stack, voidptr, iter->arg);
2291 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2292 /* If we are going to expand this node at EXPAND_AFTER_ITER
2293 then don't increase the `pos' fields of the nodes now, it
2294 will get done when expanding. */
2295 if (iter->min > 1 || iter->max > 1)
2305 case EXPAND_AFTER_ITER:
2307 tre_iteration_t *iter = node->obj;
2309 pos_add = tre_stack_pop_int(stack);
2310 pos_add_last = pos_add;
2311 if (iter->min > 1 || iter->max > 1)
2313 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2315 int pos_add_save = pos_add;
2317 /* Create a catenated sequence of copies of the node. */
2318 for (j = 0; j < iter->min; j++)
2320 tre_ast_node_t *copy;
2321 /* Remove tags from all but the last copy. */
2322 int flags = ((j + 1 < iter->min)
2324 : COPY_MAXIMIZE_FIRST_TAG);
2325 pos_add_save = pos_add;
2326 status = tre_copy_ast(mem, stack, iter->arg, flags,
2327 &pos_add, tag_directions, ©,
2329 if (status != REG_OK)
2332 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2339 if (iter->max == -1)
2341 /* No upper limit. */
2342 pos_add_save = pos_add;
2343 status = tre_copy_ast(mem, stack, iter->arg, 0,
2344 &pos_add, NULL, &seq2, &max_pos);
2345 if (status != REG_OK)
2347 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2353 for (j = iter->min; j < iter->max; j++)
2355 tre_ast_node_t *tmp, *copy;
2356 pos_add_save = pos_add;
2357 status = tre_copy_ast(mem, stack, iter->arg, 0,
2358 &pos_add, NULL, ©, &max_pos);
2359 if (status != REG_OK)
2362 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2367 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2370 seq2 = tre_ast_new_union(mem, tmp, seq2);
2376 pos_add = pos_add_save;
2379 else if (seq2 != NULL)
2380 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2383 node->obj = seq1->obj;
2384 node->type = seq1->type;
2388 pos_add_total += pos_add - pos_add_last;
2389 if (iter_depth == 0)
2390 pos_add = pos_add_total;
2400 *position += pos_add_total;
2402 /* `max_pos' should never be larger than `*position' if the above
2403 code works, but just an extra safeguard let's make sure
2404 `*position' is set large enough so enough memory will be
2405 allocated for the transition table. */
2406 if (max_pos > *position)
2407 *position = max_pos;
2412 static tre_pos_and_tags_t *
2413 tre_set_empty(tre_mem_t mem)
2415 tre_pos_and_tags_t *new_set;
2417 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2418 if (new_set == NULL)
2421 new_set[0].position = -1;
2422 new_set[0].code_min = -1;
2423 new_set[0].code_max = -1;
2428 static tre_pos_and_tags_t *
2429 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2430 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2432 tre_pos_and_tags_t *new_set;
2434 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2435 if (new_set == NULL)
2438 new_set[0].position = position;
2439 new_set[0].code_min = code_min;
2440 new_set[0].code_max = code_max;
2441 new_set[0].class = class;
2442 new_set[0].neg_classes = neg_classes;
2443 new_set[0].backref = backref;
2444 new_set[1].position = -1;
2445 new_set[1].code_min = -1;
2446 new_set[1].code_max = -1;
2451 static tre_pos_and_tags_t *
2452 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2453 int *tags, int assertions)
2456 tre_pos_and_tags_t *new_set;
2460 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2461 for (s1 = 0; set1[s1].position >= 0; s1++);
2462 for (s2 = 0; set2[s2].position >= 0; s2++);
2463 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2467 for (s1 = 0; set1[s1].position >= 0; s1++)
2469 new_set[s1].position = set1[s1].position;
2470 new_set[s1].code_min = set1[s1].code_min;
2471 new_set[s1].code_max = set1[s1].code_max;
2472 new_set[s1].assertions = set1[s1].assertions | assertions;
2473 new_set[s1].class = set1[s1].class;
2474 new_set[s1].neg_classes = set1[s1].neg_classes;
2475 new_set[s1].backref = set1[s1].backref;
2476 if (set1[s1].tags == NULL && tags == NULL)
2477 new_set[s1].tags = NULL;
2480 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2481 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2482 * (i + num_tags + 1)));
2483 if (new_tags == NULL)
2485 for (j = 0; j < i; j++)
2486 new_tags[j] = set1[s1].tags[j];
2487 for (i = 0; i < num_tags; i++)
2488 new_tags[j + i] = tags[i];
2489 new_tags[j + i] = -1;
2490 new_set[s1].tags = new_tags;
2494 for (s2 = 0; set2[s2].position >= 0; s2++)
2496 new_set[s1 + s2].position = set2[s2].position;
2497 new_set[s1 + s2].code_min = set2[s2].code_min;
2498 new_set[s1 + s2].code_max = set2[s2].code_max;
2499 /* XXX - why not | assertions here as well? */
2500 new_set[s1 + s2].assertions = set2[s2].assertions;
2501 new_set[s1 + s2].class = set2[s2].class;
2502 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2503 new_set[s1 + s2].backref = set2[s2].backref;
2504 if (set2[s2].tags == NULL)
2505 new_set[s1 + s2].tags = NULL;
2508 for (i = 0; set2[s2].tags[i] >= 0; i++);
2509 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2510 if (new_tags == NULL)
2512 for (j = 0; j < i; j++)
2513 new_tags[j] = set2[s2].tags[j];
2515 new_set[s1 + s2].tags = new_tags;
2518 new_set[s1 + s2].position = -1;
2522 /* Finds the empty path through `node' which is the one that should be
2523 taken according to POSIX.2 rules, and adds the tags on that path to
2524 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2525 set to the number of tags seen on the path. */
2526 static reg_errcode_t
2527 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2528 int *assertions, int *num_tags_seen)
2532 tre_catenation_t *cat;
2533 tre_iteration_t *iter;
2535 int bottom = tre_stack_num_objects(stack);
2536 reg_errcode_t status = REG_OK;
2540 status = tre_stack_push_voidptr(stack, node);
2542 /* Walk through the tree recursively. */
2543 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2545 node = tre_stack_pop_voidptr(stack);
2550 lit = (tre_literal_t *)node->obj;
2551 switch (lit->code_min)
2554 if (lit->code_max >= 0)
2558 /* Add the tag to `tags'. */
2559 for (i = 0; tags[i] >= 0; i++)
2560 if (tags[i] == lit->code_max)
2564 tags[i] = lit->code_max;
2573 assert(lit->code_max >= 1
2574 || lit->code_max <= ASSERT_LAST);
2575 if (assertions != NULL)
2576 *assertions |= lit->code_max;
2587 /* Subexpressions starting earlier take priority over ones
2588 starting later, so we prefer the left subexpression over the
2589 right subexpression. */
2590 uni = (tre_union_t *)node->obj;
2591 if (uni->left->nullable)
2592 STACK_PUSHX(stack, voidptr, uni->left)
2593 else if (uni->right->nullable)
2594 STACK_PUSHX(stack, voidptr, uni->right)
2600 /* The path must go through both children. */
2601 cat = (tre_catenation_t *)node->obj;
2602 assert(cat->left->nullable);
2603 assert(cat->right->nullable);
2604 STACK_PUSHX(stack, voidptr, cat->left);
2605 STACK_PUSHX(stack, voidptr, cat->right);
2609 /* A match with an empty string is preferred over no match at
2610 all, so we go through the argument if possible. */
2611 iter = (tre_iteration_t *)node->obj;
2612 if (iter->arg->nullable)
2613 STACK_PUSHX(stack, voidptr, iter->arg);
2629 NFL_POST_CATENATION,
2631 } tre_nfl_stack_symbol_t;
2634 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2635 the nodes of the AST `tree'. */
2636 static reg_errcode_t
2637 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2639 int bottom = tre_stack_num_objects(stack);
2641 STACK_PUSHR(stack, voidptr, tree);
2642 STACK_PUSHR(stack, int, NFL_RECURSE);
2644 while (tre_stack_num_objects(stack) > bottom)
2646 tre_nfl_stack_symbol_t symbol;
2647 tre_ast_node_t *node;
2649 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2650 node = tre_stack_pop_voidptr(stack);
2658 tre_literal_t *lit = (tre_literal_t *)node->obj;
2659 if (IS_BACKREF(lit))
2661 /* Back references: nullable = false, firstpos = {i},
2664 node->firstpos = tre_set_one(mem, lit->position, 0,
2665 TRE_CHAR_MAX, 0, NULL, -1);
2666 if (!node->firstpos)
2668 node->lastpos = tre_set_one(mem, lit->position, 0,
2669 TRE_CHAR_MAX, 0, NULL,
2670 (int)lit->code_max);
2674 else if (lit->code_min < 0)
2676 /* Tags, empty strings, params, and zero width assertions:
2677 nullable = true, firstpos = {}, and lastpos = {}. */
2679 node->firstpos = tre_set_empty(mem);
2680 if (!node->firstpos)
2682 node->lastpos = tre_set_empty(mem);
2688 /* Literal at position i: nullable = false, firstpos = {i},
2692 tre_set_one(mem, lit->position, (int)lit->code_min,
2693 (int)lit->code_max, 0, NULL, -1);
2694 if (!node->firstpos)
2696 node->lastpos = tre_set_one(mem, lit->position,
2699 lit->class, lit->neg_classes,
2708 /* Compute the attributes for the two subtrees, and after that
2710 STACK_PUSHR(stack, voidptr, node);
2711 STACK_PUSHR(stack, int, NFL_POST_UNION);
2712 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2713 STACK_PUSHR(stack, int, NFL_RECURSE);
2714 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2715 STACK_PUSHR(stack, int, NFL_RECURSE);
2719 /* Compute the attributes for the two subtrees, and after that
2721 STACK_PUSHR(stack, voidptr, node);
2722 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2723 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2724 STACK_PUSHR(stack, int, NFL_RECURSE);
2725 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2726 STACK_PUSHR(stack, int, NFL_RECURSE);
2730 /* Compute the attributes for the subtree, and after that for
2732 STACK_PUSHR(stack, voidptr, node);
2733 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2734 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2735 STACK_PUSHR(stack, int, NFL_RECURSE);
2738 break; /* end case: NFL_RECURSE */
2740 case NFL_POST_UNION:
2742 tre_union_t *uni = (tre_union_t *)node->obj;
2743 node->nullable = uni->left->nullable || uni->right->nullable;
2744 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2745 uni->right->firstpos, NULL, 0);
2746 if (!node->firstpos)
2748 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2749 uni->right->lastpos, NULL, 0);
2755 case NFL_POST_ITERATION:
2757 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2759 if (iter->min == 0 || iter->arg->nullable)
2763 node->firstpos = iter->arg->firstpos;
2764 node->lastpos = iter->arg->lastpos;
2768 case NFL_POST_CATENATION:
2770 int num_tags, *tags, assertions;
2771 reg_errcode_t status;
2772 tre_catenation_t *cat = node->obj;
2773 node->nullable = cat->left->nullable && cat->right->nullable;
2775 /* Compute firstpos. */
2776 if (cat->left->nullable)
2778 /* The left side matches the empty string. Make a first pass
2779 with tre_match_empty() to get the number of tags and
2781 status = tre_match_empty(stack, cat->left,
2782 NULL, NULL, &num_tags);
2783 if (status != REG_OK)
2785 /* Allocate arrays for the tags and parameters. */
2786 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2791 /* Second pass with tre_mach_empty() to get the list of
2792 tags and parameters. */
2793 status = tre_match_empty(stack, cat->left, tags,
2795 if (status != REG_OK)
2801 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2804 if (!node->firstpos)
2809 node->firstpos = cat->left->firstpos;
2812 /* Compute lastpos. */
2813 if (cat->right->nullable)
2815 /* The right side matches the empty string. Make a first pass
2816 with tre_match_empty() to get the number of tags and
2818 status = tre_match_empty(stack, cat->right,
2819 NULL, NULL, &num_tags);
2820 if (status != REG_OK)
2822 /* Allocate arrays for the tags and parameters. */
2823 tags = xmalloc(sizeof(int) * (num_tags + 1));
2828 /* Second pass with tre_mach_empty() to get the list of
2829 tags and parameters. */
2830 status = tre_match_empty(stack, cat->right, tags,
2832 if (status != REG_OK)
2838 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2846 node->lastpos = cat->right->lastpos;
2861 /* Adds a transition from each position in `p1' to each position in `p2'. */
2862 static reg_errcode_t
2863 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2864 tre_tnfa_transition_t *transitions,
2865 int *counts, int *offs)
2867 tre_pos_and_tags_t *orig_p2 = p2;
2868 tre_tnfa_transition_t *trans;
2869 int i, j, k, l, dup, prev_p2_pos;
2871 if (transitions != NULL)
2872 while (p1->position >= 0)
2876 while (p2->position >= 0)
2878 /* Optimization: if this position was already handled, skip it. */
2879 if (p2->position == prev_p2_pos)
2884 prev_p2_pos = p2->position;
2885 /* Set `trans' to point to the next unused transition from
2886 position `p1->position'. */
2887 trans = transitions + offs[p1->position];
2888 while (trans->state != NULL)
2891 /* If we find a previous transition from `p1->position' to
2892 `p2->position', it is overwritten. This can happen only
2893 if there are nested loops in the regexp, like in "((a)*)*".
2894 In POSIX.2 repetition using the outer loop is always
2895 preferred over using the inner loop. Therefore the
2896 transition for the inner loop is useless and can be thrown
2898 /* XXX - The same position is used for all nodes in a bracket
2899 expression, so this optimization cannot be used (it will
2900 break bracket expressions) unless I figure out a way to
2902 if (trans->state_id == p2->position)
2910 if (trans->state == NULL)
2911 (trans + 1)->state = NULL;
2912 /* Use the character ranges, assertions, etc. from `p1' for
2913 the transition from `p1' to `p2'. */
2914 trans->code_min = p1->code_min;
2915 trans->code_max = p1->code_max;
2916 trans->state = transitions + offs[p2->position];
2917 trans->state_id = p2->position;
2918 trans->assertions = p1->assertions | p2->assertions
2919 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2920 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2921 if (p1->backref >= 0)
2923 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2924 assert(p2->backref < 0);
2925 trans->u.backref = p1->backref;
2926 trans->assertions |= ASSERT_BACKREF;
2929 trans->u.class = p1->class;
2930 if (p1->neg_classes != NULL)
2932 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2933 trans->neg_classes =
2934 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2935 if (trans->neg_classes == NULL)
2937 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2938 trans->neg_classes[i] = p1->neg_classes[i];
2939 trans->neg_classes[i] = (tre_ctype_t)0;
2942 trans->neg_classes = NULL;
2944 /* Find out how many tags this transition has. */
2946 if (p1->tags != NULL)
2947 while(p1->tags[i] >= 0)
2950 if (p2->tags != NULL)
2951 while(p2->tags[j] >= 0)
2954 /* If we are overwriting a transition, free the old tag array. */
2955 if (trans->tags != NULL)
2959 /* If there were any tags, allocate an array and fill it. */
2962 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2966 if (p1->tags != NULL)
2967 while(p1->tags[i] >= 0)
2969 trans->tags[i] = p1->tags[i];
2974 if (p2->tags != NULL)
2975 while (p2->tags[j] >= 0)
2977 /* Don't add duplicates. */
2979 for (k = 0; k < i; k++)
2980 if (trans->tags[k] == p2->tags[j])
2986 trans->tags[l++] = p2->tags[j];
2989 trans->tags[l] = -1;
2997 /* Compute a maximum limit for the number of transitions leaving
2999 while (p1->position >= 0)
3002 while (p2->position >= 0)
3004 counts[p1->position]++;
3012 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3013 labelled with one character range (there are no transitions on empty
3014 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3016 static reg_errcode_t
3017 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3018 int *counts, int *offs)
3021 tre_catenation_t *cat;
3022 tre_iteration_t *iter;
3023 reg_errcode_t errcode = REG_OK;
3025 /* XXX - recurse using a stack!. */
3031 uni = (tre_union_t *)node->obj;
3032 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3033 if (errcode != REG_OK)
3035 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3039 cat = (tre_catenation_t *)node->obj;
3040 /* Add a transition from each position in cat->left->lastpos
3041 to each position in cat->right->firstpos. */
3042 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3043 transitions, counts, offs);
3044 if (errcode != REG_OK)
3046 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3047 if (errcode != REG_OK)
3049 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3053 iter = (tre_iteration_t *)node->obj;
3054 assert(iter->max == -1 || iter->max == 1);
3056 if (iter->max == -1)
3058 assert(iter->min == 0 || iter->min == 1);
3059 /* Add a transition from each last position in the iterated
3060 expression to each first position. */
3061 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3062 transitions, counts, offs);
3063 if (errcode != REG_OK)
3066 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3073 #define ERROR_EXIT(err) \
3077 if (/*CONSTCOND*/1) \
3080 while (/*CONSTCOND*/0)
3084 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
3087 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3088 tre_pos_and_tags_t *p;
3089 int *counts = NULL, *offs = NULL;
3091 tre_tnfa_transition_t *transitions, *initial;
3092 tre_tnfa_t *tnfa = NULL;
3093 tre_submatch_data_t *submatch_data;
3094 tre_tag_direction_t *tag_directions = NULL;
3095 reg_errcode_t errcode;
3098 /* Parse context. */
3099 tre_parse_ctx_t parse_ctx;
3101 /* Allocate a stack used throughout the compilation process for various
3103 stack = tre_stack_new(512, 10240, 128);
3106 /* Allocate a fast memory allocator. */
3107 mem = tre_mem_new();
3110 tre_stack_destroy(stack);
3114 /* Parse the regexp. */
3115 memset(&parse_ctx, 0, sizeof(parse_ctx));
3116 parse_ctx.mem = mem;
3117 parse_ctx.stack = stack;
3118 parse_ctx.re = regex;
3119 parse_ctx.cflags = cflags;
3120 parse_ctx.max_backref = -1;
3121 errcode = tre_parse(&parse_ctx);
3122 if (errcode != REG_OK)
3123 ERROR_EXIT(errcode);
3124 preg->re_nsub = parse_ctx.submatch_id - 1;
3125 tree = parse_ctx.result;
3127 /* Back references and approximate matching cannot currently be used
3128 in the same regexp. */
3129 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3130 ERROR_EXIT(REG_BADPAT);
3133 tre_ast_print(tree);
3134 #endif /* TRE_DEBUG */
3136 /* Referring to nonexistent subexpressions is illegal. */
3137 if (parse_ctx.max_backref > (int)preg->re_nsub)
3138 ERROR_EXIT(REG_ESUBREG);
3140 /* Allocate the TNFA struct. */
3141 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3143 ERROR_EXIT(REG_ESPACE);
3144 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3145 tnfa->have_approx = parse_ctx.have_approx;
3146 tnfa->num_submatches = parse_ctx.submatch_id;
3148 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3149 regexp does not have back references, this can be skipped. */
3150 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3153 /* Figure out how many tags we will need. */
3154 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3155 if (errcode != REG_OK)
3156 ERROR_EXIT(errcode);
3158 if (tnfa->num_tags > 0)
3160 tag_directions = xmalloc(sizeof(*tag_directions)
3161 * (tnfa->num_tags + 1));
3162 if (tag_directions == NULL)
3163 ERROR_EXIT(REG_ESPACE);
3164 tnfa->tag_directions = tag_directions;
3165 memset(tag_directions, -1,
3166 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3168 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3169 sizeof(*tnfa->minimal_tags));
3170 if (tnfa->minimal_tags == NULL)
3171 ERROR_EXIT(REG_ESPACE);
3173 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3174 sizeof(*submatch_data));
3175 if (submatch_data == NULL)
3176 ERROR_EXIT(REG_ESPACE);
3177 tnfa->submatch_data = submatch_data;
3179 errcode = tre_add_tags(mem, stack, tree, tnfa);
3180 if (errcode != REG_OK)
3181 ERROR_EXIT(errcode);
3185 /* Expand iteration nodes. */
3186 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3188 if (errcode != REG_OK)
3189 ERROR_EXIT(errcode);
3191 /* Add a dummy node for the final state.
3192 XXX - For certain patterns this dummy node can be optimized away,
3193 for example "a*" or "ab*". Figure out a simple way to detect
3194 this possibility. */
3196 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3197 if (tmp_ast_r == NULL)
3198 ERROR_EXIT(REG_ESPACE);
3200 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3202 ERROR_EXIT(REG_ESPACE);
3204 errcode = tre_compute_nfl(mem, stack, tree);
3205 if (errcode != REG_OK)
3206 ERROR_EXIT(errcode);
3208 counts = xmalloc(sizeof(int) * parse_ctx.position);
3210 ERROR_EXIT(REG_ESPACE);
3212 offs = xmalloc(sizeof(int) * parse_ctx.position);
3214 ERROR_EXIT(REG_ESPACE);
3216 for (i = 0; i < parse_ctx.position; i++)
3218 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3221 for (i = 0; i < parse_ctx.position; i++)
3224 add += counts[i] + 1;
3227 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3228 if (transitions == NULL)
3229 ERROR_EXIT(REG_ESPACE);
3230 tnfa->transitions = transitions;
3231 tnfa->num_transitions = add;
3233 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3234 if (errcode != REG_OK)
3235 ERROR_EXIT(errcode);
3237 tnfa->firstpos_chars = NULL;
3241 while (p->position >= 0)
3247 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3248 if (initial == NULL)
3249 ERROR_EXIT(REG_ESPACE);
3250 tnfa->initial = initial;
3253 for (p = tree->firstpos; p->position >= 0; p++)
3255 initial[i].state = transitions + offs[p->position];
3256 initial[i].state_id = p->position;
3257 initial[i].tags = NULL;
3258 /* Copy the arrays p->tags, and p->params, they are allocated
3259 from a tre_mem object. */
3263 for (j = 0; p->tags[j] >= 0; j++);
3264 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3265 if (!initial[i].tags)
3266 ERROR_EXIT(REG_ESPACE);
3267 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3269 initial[i].assertions = p->assertions;
3272 initial[i].state = NULL;
3274 tnfa->num_transitions = add;
3275 tnfa->final = transitions + offs[tree->lastpos[0].position];
3276 tnfa->num_states = parse_ctx.position;
3277 tnfa->cflags = cflags;
3279 tre_mem_destroy(mem);
3280 tre_stack_destroy(stack);
3284 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3288 /* Free everything that was allocated and return the error code. */
3289 tre_mem_destroy(mem);
3291 tre_stack_destroy(stack);
3296 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3305 regfree(regex_t *preg)
3309 tre_tnfa_transition_t *trans;
3311 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3315 for (i = 0; i < tnfa->num_transitions; i++)
3316 if (tnfa->transitions[i].state)
3318 if (tnfa->transitions[i].tags)
3319 xfree(tnfa->transitions[i].tags);
3320 if (tnfa->transitions[i].neg_classes)
3321 xfree(tnfa->transitions[i].neg_classes);
3323 if (tnfa->transitions)
3324 xfree(tnfa->transitions);
3328 for (trans = tnfa->initial; trans->state; trans++)
3333 xfree(tnfa->initial);
3336 if (tnfa->submatch_data)
3338 for (i = 0; i < tnfa->num_submatches; i++)
3339 if (tnfa->submatch_data[i].parents)
3340 xfree(tnfa->submatch_data[i].parents);
3341 xfree(tnfa->submatch_data);
3344 if (tnfa->tag_directions)
3345 xfree(tnfa->tag_directions);
3346 if (tnfa->firstpos_chars)
3347 xfree(tnfa->firstpos_chars);
3348 if (tnfa->minimal_tags)
3349 xfree(tnfa->minimal_tags);