2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 /***********************************************************************
45 ***********************************************************************/
54 tre_ctype_t *neg_classes;
59 /***********************************************************************
60 from tre-ast.c and tre-ast.h
61 ***********************************************************************/
63 /* The different AST node types. */
71 /* Special subtypes of TRE_LITERAL. */
72 #define EMPTY -1 /* Empty leaf (denotes empty string). */
73 #define ASSERTION -2 /* Assertion leaf. */
74 #define TAG -3 /* Tag leaf. */
75 #define BACKREF -4 /* Back reference leaf. */
77 #define IS_SPECIAL(x) ((x)->code_min < 0)
78 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
80 #define IS_TAG(x) ((x)->code_min == TAG)
81 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
84 /* A generic AST node. All AST nodes consist of this node on the top
85 level with `obj' pointing to the actual content. */
87 tre_ast_type_t type; /* Type of the node. */
88 void *obj; /* Pointer to actual node. */
93 tre_pos_and_tags_t *firstpos;
94 tre_pos_and_tags_t *lastpos;
98 /* A "literal" node. These are created for assertions, back references,
99 tags, matching parameter settings, and all expressions that match one
106 tre_ctype_t *neg_classes;
109 /* A "catenation" node. These are created when two regexps are concatenated.
110 If there are more than one subexpressions in sequence, the `left' part
111 holds all but the last, and `right' part holds the last subexpression
112 (catenation is left associative). */
114 tre_ast_node_t *left;
115 tre_ast_node_t *right;
118 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
121 /* Subexpression to match. */
123 /* Minimum number of consecutive matches. */
125 /* Maximum number of consecutive matches. */
127 /* If 0, match as many characters as possible, if 1 match as few as
128 possible. Note that this does not always mean the same thing as
129 matching as many/few repetitions as possible. */
130 unsigned int minimal:1;
133 /* An "union" node. These are created for the "|" operator. */
135 tre_ast_node_t *left;
136 tre_ast_node_t *right;
139 static tre_ast_node_t *
140 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
142 static tre_ast_node_t *
143 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
145 static tre_ast_node_t *
146 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
149 static tre_ast_node_t *
150 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
152 static tre_ast_node_t *
153 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
154 tre_ast_node_t *right);
157 static tre_ast_node_t *
158 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
160 tre_ast_node_t *node;
162 node = tre_mem_calloc(mem, sizeof(*node));
165 node->obj = tre_mem_calloc(mem, size);
170 node->submatch_id = -1;
175 static tre_ast_node_t *
176 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
178 tre_ast_node_t *node;
181 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
185 lit->code_min = code_min;
186 lit->code_max = code_max;
187 lit->position = position;
192 static tre_ast_node_t *
193 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
196 tre_ast_node_t *node;
197 tre_iteration_t *iter;
199 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
206 iter->minimal = minimal;
207 node->num_submatches = arg->num_submatches;
212 static tre_ast_node_t *
213 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
215 tre_ast_node_t *node;
217 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
220 ((tre_union_t *)node->obj)->left = left;
221 ((tre_union_t *)node->obj)->right = right;
222 node->num_submatches = left->num_submatches + right->num_submatches;
227 static tre_ast_node_t *
228 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
229 tre_ast_node_t *right)
231 tre_ast_node_t *node;
233 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
236 ((tre_catenation_t *)node->obj)->left = left;
237 ((tre_catenation_t *)node->obj)->right = right;
238 node->num_submatches = left->num_submatches + right->num_submatches;
244 /***********************************************************************
245 from tre-stack.c and tre-stack.h
246 ***********************************************************************/
248 typedef struct tre_stack_rec tre_stack_t;
250 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
251 is maximum size, and `increment' specifies how much more space will be
252 allocated with realloc() if all space gets used up. Returns the stack
253 object or NULL if out of memory. */
255 tre_stack_new(int size, int max_size, int increment);
257 /* Frees the stack object. */
259 tre_stack_destroy(tre_stack_t *s);
261 /* Returns the current number of objects in the stack. */
263 tre_stack_num_objects(tre_stack_t *s);
265 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
266 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
267 This tries to realloc() more space before failing if maximum size
268 has not yet been reached. Returns REG_OK if successful. */
269 #define declare_pushf(typetag, type) \
270 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
272 declare_pushf(voidptr, void *);
273 declare_pushf(int, int);
275 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
276 element off of stack `s' and returns it. The stack must not be
278 #define declare_popf(typetag, type) \
279 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
281 declare_popf(voidptr, void *);
282 declare_popf(int, int);
284 /* Just to save some typing. */
285 #define STACK_PUSH(s, typetag, value) \
288 status = tre_stack_push_ ## typetag(s, value); \
290 while (/*CONSTCOND*/0)
292 #define STACK_PUSHX(s, typetag, value) \
294 status = tre_stack_push_ ## typetag(s, value); \
295 if (status != REG_OK) \
299 #define STACK_PUSHR(s, typetag, value) \
301 reg_errcode_t _status; \
302 _status = tre_stack_push_ ## typetag(s, value); \
303 if (_status != REG_OK) \
307 union tre_stack_item {
312 struct tre_stack_rec {
317 union tre_stack_item *stack;
322 tre_stack_new(int size, int max_size, int increment)
326 s = xmalloc(sizeof(*s));
329 s->stack = xmalloc(sizeof(*s->stack) * size);
330 if (s->stack == NULL)
336 s->max_size = max_size;
337 s->increment = increment;
344 tre_stack_destroy(tre_stack_t *s)
351 tre_stack_num_objects(tre_stack_t *s)
357 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
359 if (s->ptr < s->size)
361 s->stack[s->ptr] = value;
366 if (s->size >= s->max_size)
372 union tre_stack_item *new_buffer;
374 new_size = s->size + s->increment;
375 if (new_size > s->max_size)
376 new_size = s->max_size;
377 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
378 if (new_buffer == NULL)
382 assert(new_size > s->size);
384 s->stack = new_buffer;
385 tre_stack_push(s, value);
391 #define define_pushf(typetag, type) \
392 declare_pushf(typetag, type) { \
393 union tre_stack_item item; \
394 item.typetag ## _value = value; \
395 return tre_stack_push(s, item); \
398 define_pushf(int, int)
399 define_pushf(voidptr, void *)
401 #define define_popf(typetag, type) \
402 declare_popf(typetag, type) { \
403 return s->stack[--s->ptr].typetag ## _value; \
406 define_popf(int, int)
407 define_popf(voidptr, void *)
410 /***********************************************************************
411 from tre-parse.c and tre-parse.h
412 ***********************************************************************/
416 /* Memory allocator. The AST is allocated using this. */
418 /* Stack used for keeping track of regexp syntax. */
420 /* The parse result. */
421 tre_ast_node_t *result;
422 /* The regexp to parse and its length. */
424 /* The first character of the entire regexp. */
425 const char *re_start;
426 /* Current submatch ID. */
428 /* Current position (number of literal). */
430 /* The highest back reference or -1 if none seen so far. */
432 /* This flag is set if the regexp uses approximate matching. */
434 /* Compilation flags. */
436 /* If this flag is set the top-level submatch is not captured. */
440 /* Parses a wide character regexp pattern into a syntax tree. This parser
441 handles both syntaxes (BRE and ERE), including the TRE extensions. */
443 tre_parse(tre_parse_ctx_t *ctx);
447 This parser is just a simple recursive descent parser for POSIX.2
448 regexps. The parser supports both the obsolete default syntax and
449 the "extended" syntax, and some nonstandard extensions.
452 /* Characters with special meanings in regexp syntax. */
453 #define CHAR_PIPE '|'
454 #define CHAR_LPAREN '('
455 #define CHAR_RPAREN ')'
456 #define CHAR_LBRACE '{'
457 #define CHAR_RBRACE '}'
458 #define CHAR_LBRACKET '['
459 #define CHAR_RBRACKET ']'
460 #define CHAR_MINUS '-'
461 #define CHAR_STAR '*'
462 #define CHAR_QUESTIONMARK '?'
463 #define CHAR_PLUS '+'
464 #define CHAR_PERIOD '.'
465 #define CHAR_COLON ':'
466 #define CHAR_EQUAL '='
467 #define CHAR_COMMA ','
468 #define CHAR_CARET '^'
469 #define CHAR_DOLLAR '$'
470 #define CHAR_BACKSLASH '\\'
471 #define CHAR_HASH '#'
472 #define CHAR_TILDE '~'
475 /* Some macros for expanding \w, \s, etc. */
476 static const struct tre_macro_struct {
478 const char *expansion;
480 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
481 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
482 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
483 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
488 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
489 must have at least `len' items. Sets buf[0] to zero if the there
490 is no match in `tre_macros'. */
492 tre_expand_macro(const char *regex)
499 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
500 return tre_macros[i].expansion;
504 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
505 tre_ast_node_t ***items)
507 reg_errcode_t status;
508 tre_ast_node_t **array = *items;
509 /* Allocate more space if necessary. */
512 tre_ast_node_t **new_items;
513 /* If the array is already 1024 items large, give up -- there's
514 probably an error in the regexp (e.g. not a '\0' terminated
515 string and missing ']') */
519 new_items = xrealloc(array, sizeof(*items) * *max_i);
520 if (new_items == NULL)
522 *items = array = new_items;
524 array[*i] = tre_ast_new_literal(mem, min, max, -1);
525 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
532 tre_compare_items(const void *a, const void *b)
534 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
535 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
536 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
537 int a_min = l_a->code_min, b_min = l_b->code_min;
541 else if (a_min > b_min)
547 /* Maximum number of character classes that can occur in a negated bracket
549 #define MAX_NEG_CLASSES 64
551 /* Maximum length of character class names. */
552 #define MAX_CLASS_NAME
555 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
556 tre_ctype_t neg_classes[], int *num_neg_classes,
557 tre_ast_node_t ***items, int *num_items,
560 const char *re = ctx->re;
561 reg_errcode_t status = REG_OK;
562 tre_ctype_t class = (tre_ctype_t)0;
564 int max_i = *items_size;
567 /* Build an array of the items in the bracket expression. */
568 while (status == REG_OK)
575 else if (*re == CHAR_RBRACKET && re > ctx->re)
582 tre_cint_t min = 0, max = 0;
584 int clen = mbtowc(&wc, re, -1);
586 if (clen<0) clen=1, wc=WEOF;
588 class = (tre_ctype_t)0;
589 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
593 clen = mbtowc(&wc, re, -1);
594 if (clen<0) clen=1, wc=WEOF;
597 /* XXX - Should use collation order instead of encoding values
598 in character ranges. */
602 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
603 status = REG_ECOLLATE;
604 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
605 status = REG_ECOLLATE;
606 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
609 const char *endptr = re + 2;
611 while (*endptr && *endptr != CHAR_COLON)
615 len = MIN(endptr - re - 2, 63);
616 strncpy(tmp_str, re + 2, len);
618 class = tre_ctype(tmp_str);
630 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
632 /* Two ranges are not allowed to share and endpoint. */
638 if (status != REG_OK)
642 if (*num_neg_classes >= MAX_NEG_CLASSES)
645 neg_classes[(*num_neg_classes)++] = class;
648 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
649 if (status != REG_OK)
651 ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
654 /* Add opposite-case counterpoints if REG_ICASE is present.
655 This is broken if there are more than two "same" characters. */
656 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
658 tre_cint_t cmin, ccurr;
662 if (tre_islower(min))
664 cmin = ccurr = tre_toupper(min++);
665 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
667 ccurr = tre_toupper(min++);
668 status = tre_new_item(ctx->mem, cmin, ccurr,
671 else if (tre_isupper(min))
673 cmin = ccurr = tre_tolower(min++);
674 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
676 ccurr = tre_tolower(min++);
677 status = tre_new_item(ctx->mem, cmin, ccurr,
681 if (status != REG_OK)
684 if (status != REG_OK)
696 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
698 tre_ast_node_t *node = NULL;
700 reg_errcode_t status = REG_OK;
701 tre_ast_node_t **items, *u, *n;
702 int i = 0, j, max_i = 32, curr_max, curr_min;
703 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
704 int num_neg_classes = 0;
706 /* Start off with an array of `max_i' elements. */
707 items = xmalloc(sizeof(*items) * max_i);
711 if (*ctx->re == CHAR_CARET)
717 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
720 if (status != REG_OK)
721 goto parse_bracket_done;
723 /* Sort the array if we need to negate it. */
725 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
727 curr_max = curr_min = 0;
728 /* Build a union of the items in the array, negated if necessary. */
729 for (j = 0; j < i && status == REG_OK; j++)
732 tre_literal_t *l = items[j]->obj;
741 curr_max = MAX(max + 1, curr_max);
748 if (curr_max >= curr_min)
750 l->code_min = curr_min;
751 l->code_max = curr_max;
757 curr_min = curr_max = max + 1;
764 l->position = ctx->position;
765 if (num_neg_classes > 0)
767 l->neg_classes = tre_mem_alloc(ctx->mem,
768 (sizeof(l->neg_classes)
769 * (num_neg_classes + 1)));
770 if (l->neg_classes == NULL)
775 for (k = 0; k < num_neg_classes; k++)
776 l->neg_classes[k] = neg_classes[k];
777 l->neg_classes[k] = (tre_ctype_t)0;
780 l->neg_classes = NULL;
785 u = tre_ast_new_union(ctx->mem, node, items[j]);
793 if (status != REG_OK)
794 goto parse_bracket_done;
799 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
804 tre_literal_t *l = n->obj;
805 if (num_neg_classes > 0)
807 l->neg_classes = tre_mem_alloc(ctx->mem,
808 (sizeof(l->neg_classes)
809 * (num_neg_classes + 1)));
810 if (l->neg_classes == NULL)
813 goto parse_bracket_done;
815 for (k = 0; k < num_neg_classes; k++)
816 l->neg_classes[k] = neg_classes[k];
817 l->neg_classes[k] = (tre_ctype_t)0;
820 l->neg_classes = NULL;
825 u = tre_ast_new_union(ctx->mem, node, n);
833 if (status != REG_OK)
834 goto parse_bracket_done;
838 #endif /* TRE_DEBUG */
848 /* Parses a positive decimal integer. Returns -1 if the string does not
849 contain a valid number. */
851 tre_parse_int(const char **regex)
854 const char *r = *regex;
859 num = num * 10 + *r - '0';
868 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
871 const char *r = ctx->re;
874 /* Parse number (minimum repetition count). */
876 if (*r >= '0' && *r <= '9') {
877 min = tre_parse_int(&r);
880 /* Parse comma and second number (maximum repetition count). */
882 if (*r == CHAR_COMMA)
885 max = tre_parse_int(&r);
888 /* Check that the repeat counts are sane. */
889 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
896 /* Empty contents of {}. */
900 /* Parse the ending '}' or '\}'.*/
901 if (ctx->cflags & REG_EXTENDED)
903 if (*r != CHAR_RBRACE)
909 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
914 /* Create the AST node(s). */
915 if (min == 0 && max == 0)
917 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
923 if (min < 0 && max < 0)
924 /* Only approximate parameters set, no repetitions. */
927 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
939 PARSE_MARK_FOR_SUBMATCH,
943 PARSE_POST_CATENATION,
948 } tre_parse_re_stack_symbol_t;
952 tre_parse(tre_parse_ctx_t *ctx)
954 tre_ast_node_t *result = NULL;
955 tre_parse_re_stack_symbol_t symbol;
956 reg_errcode_t status = REG_OK;
957 tre_stack_t *stack = ctx->stack;
958 int bottom = tre_stack_num_objects(stack);
963 if (!ctx->nofirstsub)
965 STACK_PUSH(stack, int, ctx->submatch_id);
966 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
969 STACK_PUSH(stack, int, PARSE_RE);
970 ctx->re_start = ctx->re;
973 /* The following is basically just a recursive descent parser. I use
974 an explicit stack instead of recursive functions mostly because of
975 two reasons: compatibility with systems which have an overflowable
976 call stack, and efficiency (both in lines of code and speed). */
977 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
979 if (status != REG_OK)
981 symbol = tre_stack_pop_int(stack);
985 /* Parse a full regexp. A regexp is one or more branches,
986 separated by the union operator `|'. */
987 if (ctx->cflags & REG_EXTENDED)
988 STACK_PUSHX(stack, int, PARSE_UNION);
989 STACK_PUSHX(stack, int, PARSE_BRANCH);
993 /* Parse a branch. A branch is one or more pieces, concatenated.
994 A piece is an atom possibly followed by a postfix operator. */
995 STACK_PUSHX(stack, int, PARSE_CATENATION);
996 STACK_PUSHX(stack, int, PARSE_PIECE);
1000 /* Parse a piece. A piece is an atom possibly followed by one
1001 or more postfix operators. */
1002 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1003 STACK_PUSHX(stack, int, PARSE_ATOM);
1006 case PARSE_CATENATION:
1007 /* If the expression has not ended, parse another piece. */
1013 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1015 if ((ctx->cflags & REG_EXTENDED
1016 && c == CHAR_RPAREN && depth > 0)
1017 || (!(ctx->cflags & REG_EXTENDED)
1018 && (c == CHAR_BACKSLASH
1019 && *(ctx->re + 1) == CHAR_RPAREN)))
1021 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1022 status = REG_EPAREN;
1024 if (!(ctx->cflags & REG_EXTENDED))
1030 /* Default case, left associative concatenation. */
1031 STACK_PUSHX(stack, int, PARSE_CATENATION);
1032 STACK_PUSHX(stack, voidptr, result);
1033 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1034 STACK_PUSHX(stack, int, PARSE_PIECE);
1039 case PARSE_POST_CATENATION:
1041 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1042 tre_ast_node_t *tmp_node;
1043 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1054 STACK_PUSHX(stack, int, PARSE_UNION);
1055 STACK_PUSHX(stack, voidptr, result);
1056 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1057 STACK_PUSHX(stack, int, PARSE_BRANCH);
1070 case PARSE_POST_UNION:
1072 tre_ast_node_t *tmp_node;
1073 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1074 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1082 /* Parse postfix operators. */
1086 case CHAR_QUESTIONMARK:
1087 if (!(ctx->cflags & REG_EXTENDED))
1092 tre_ast_node_t *tmp_node;
1097 if (*ctx->re == CHAR_PLUS)
1099 if (*ctx->re == CHAR_QUESTIONMARK)
1103 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1105 if (tmp_node == NULL)
1108 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1112 case CHAR_BACKSLASH:
1113 /* "\{" is special without REG_EXTENDED */
1114 if (!(ctx->cflags & REG_EXTENDED)
1115 && *(ctx->re + 1) == CHAR_LBRACE)
1124 /* "{" is literal without REG_EXTENDED */
1125 if (!(ctx->cflags & REG_EXTENDED))
1131 status = tre_parse_bound(ctx, &result);
1132 if (status != REG_OK)
1134 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1140 /* Parse an atom. An atom is a regular expression enclosed in `()',
1141 an empty set of `()', a bracket expression, `.', `^', `$',
1142 a `\' followed by a character, or a single character. */
1146 case CHAR_LPAREN: /* parenthesized subexpression */
1148 if (ctx->cflags & REG_EXTENDED)
1154 /* First parse a whole RE, then mark the resulting tree
1156 STACK_PUSHX(stack, int, ctx->submatch_id);
1157 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1158 STACK_PUSHX(stack, int, PARSE_RE);
1166 case CHAR_LBRACKET: /* bracket expression */
1168 status = tre_parse_bracket(ctx, &result);
1169 if (status != REG_OK)
1173 case CHAR_BACKSLASH:
1174 /* If this is "\(" or "\)" chew off the backslash and
1176 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1181 if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
1186 /* If a macro is used, parse the expanded macro recursively. */
1188 const char *buf = tre_expand_macro(ctx->re + 1);
1191 tre_parse_ctx_t subctx;
1192 memcpy(&subctx, ctx, sizeof(subctx));
1194 subctx.nofirstsub = 1;
1195 status = tre_parse(&subctx);
1196 if (status != REG_OK)
1199 ctx->position = subctx.position;
1200 result = subctx.result;
1206 /* Trailing backslash. */
1213 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1218 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1219 ASSERT_AT_WB_NEG, -1);
1223 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1228 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1234 if (ctx->re[0] != CHAR_LBRACE)
1236 /* 8 bit hex char. */
1237 char tmp[3] = {0, 0, 0};
1240 if (tre_isxdigit(ctx->re[0]))
1242 tmp[0] = (char)ctx->re[0];
1245 if (tre_isxdigit(ctx->re[0]))
1247 tmp[1] = (char)ctx->re[0];
1250 val = strtol(tmp, NULL, 16);
1251 result = tre_ast_new_literal(ctx->mem, (int)val,
1252 (int)val, ctx->position);
1263 while (*ctx->re && i < sizeof tmp)
1265 if (ctx->re[0] == CHAR_RBRACE)
1267 if (tre_isxdigit(ctx->re[0]))
1269 tmp[i] = (char)ctx->re[0];
1278 val = strtol(tmp, NULL, 16);
1279 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1287 if (tre_isdigit(*ctx->re))
1289 /* Back reference. */
1290 int val = *ctx->re - '0';
1291 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1296 ctx->max_backref = MAX(val, ctx->max_backref);
1301 /* Escaped character. */
1302 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1313 case CHAR_PERIOD: /* the any-symbol */
1314 if (ctx->cflags & REG_NEWLINE)
1316 tre_ast_node_t *tmp1;
1317 tre_ast_node_t *tmp2;
1318 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1322 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1326 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1333 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1342 case CHAR_CARET: /* beginning of line assertion */
1343 /* '^' has a special meaning everywhere in EREs, and at
1344 beginning of BRE. */
1345 if (ctx->cflags & REG_EXTENDED
1346 || ctx->re == ctx->re_start)
1348 if (!(ctx->cflags & REG_EXTENDED))
1349 STACK_PUSHX(stack, int, PARSE_CATENATION);
1350 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1360 case CHAR_DOLLAR: /* end of line assertion. */
1361 /* '$' is special everywhere in EREs, and in the end of the
1363 if (ctx->cflags & REG_EXTENDED
1366 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1383 case CHAR_QUESTIONMARK:
1384 if (!(ctx->cflags & REG_EXTENDED))
1389 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1397 clen = mbtowc(&wc, ctx->re, -1);
1398 if (clen<0) clen=1, wc=WEOF;
1400 /* Note that we can't use an tre_isalpha() test here, since there
1401 may be characters which are alphabetic but neither upper or
1403 if (ctx->cflags & REG_ICASE
1404 && (tre_isupper(wc) || tre_islower(wc)))
1406 tre_ast_node_t *tmp1;
1407 tre_ast_node_t *tmp2;
1409 /* XXX - Can there be more than one opposite-case
1410 counterpoints for some character in some locale? Or
1411 more than two characters which all should be regarded
1412 the same character if case is ignored? If yes, there
1413 does not seem to be a portable way to detect it. I guess
1414 that at least for multi-character collating elements there
1415 could be several opposite-case counterpoints, but they
1416 cannot be supported portably anyway. */
1417 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1422 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1427 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1433 result = tre_ast_new_literal(ctx->mem, wc, wc,
1444 case PARSE_MARK_FOR_SUBMATCH:
1446 int submatch_id = tre_stack_pop_int(stack);
1448 if (result->submatch_id >= 0)
1450 tre_ast_node_t *n, *tmp_node;
1451 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1454 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1455 if (tmp_node == NULL)
1457 tmp_node->num_submatches = result->num_submatches;
1460 result->submatch_id = submatch_id;
1461 result->num_submatches++;
1465 case PARSE_RESTORE_CFLAGS:
1466 ctx->cflags = tre_stack_pop_int(stack);
1475 /* Check for missing closing parentheses. */
1479 if (status == REG_OK)
1480 ctx->result = result;
1487 /***********************************************************************
1489 ***********************************************************************/
1494 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1499 Algorithms to setup tags so that submatch addressing can be done.
1503 /* Inserts a catenation node to the root of the tree given in `node'.
1504 As the left child a new tag with number `tag_id' to `node' is added,
1505 and the right child is the old root. */
1506 static reg_errcode_t
1507 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1509 tre_catenation_t *c;
1511 c = tre_mem_alloc(mem, sizeof(*c));
1514 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1515 if (c->left == NULL)
1517 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1518 if (c->right == NULL)
1521 c->right->obj = node->obj;
1522 c->right->type = node->type;
1523 c->right->nullable = -1;
1524 c->right->submatch_id = -1;
1525 c->right->firstpos = NULL;
1526 c->right->lastpos = NULL;
1527 c->right->num_tags = 0;
1529 node->type = CATENATION;
1533 /* Inserts a catenation node to the root of the tree given in `node'.
1534 As the right child a new tag with number `tag_id' to `node' is added,
1535 and the left child is the old root. */
1536 static reg_errcode_t
1537 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1539 tre_catenation_t *c;
1541 c = tre_mem_alloc(mem, sizeof(*c));
1544 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1545 if (c->right == NULL)
1547 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1548 if (c->left == NULL)
1551 c->left->obj = node->obj;
1552 c->left->type = node->type;
1553 c->left->nullable = -1;
1554 c->left->submatch_id = -1;
1555 c->left->firstpos = NULL;
1556 c->left->lastpos = NULL;
1557 c->left->num_tags = 0;
1559 node->type = CATENATION;
1565 ADDTAGS_AFTER_ITERATION,
1566 ADDTAGS_AFTER_UNION_LEFT,
1567 ADDTAGS_AFTER_UNION_RIGHT,
1568 ADDTAGS_AFTER_CAT_LEFT,
1569 ADDTAGS_AFTER_CAT_RIGHT,
1570 ADDTAGS_SET_SUBMATCH_END
1571 } tre_addtags_symbol_t;
1580 /* Go through `regset' and set submatch data for submatches that are
1583 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1587 for (i = 0; regset[i] >= 0; i++)
1589 int id = regset[i] / 2;
1590 int start = !(regset[i] % 2);
1592 tnfa->submatch_data[id].so_tag = tag;
1594 tnfa->submatch_data[id].eo_tag = tag;
1600 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1601 subexpressions marked for submatch addressing can be traced. */
1602 static reg_errcode_t
1603 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1606 reg_errcode_t status = REG_OK;
1607 tre_addtags_symbol_t symbol;
1608 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1609 int bottom = tre_stack_num_objects(stack);
1610 /* True for first pass (counting number of needed tags) */
1611 int first_pass = (mem == NULL || tnfa == NULL);
1612 int *regset, *orig_regset;
1613 int num_tags = 0; /* Total number of tags. */
1614 int num_minimals = 0; /* Number of special minimal tags. */
1615 int tag = 0; /* The tag that is to be added next. */
1616 int next_tag = 1; /* Next tag to use after this one. */
1617 int *parents; /* Stack of submatches the current submatch is
1619 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1620 tre_tag_states_t *saved_states;
1622 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1626 tnfa->minimal_tags[0] = -1;
1629 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1633 orig_regset = regset;
1635 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1636 if (parents == NULL)
1643 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1644 if (saved_states == NULL)
1653 for (i = 0; i <= tnfa->num_submatches; i++)
1654 saved_states[i].tag = -1;
1657 STACK_PUSH(stack, voidptr, node);
1658 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1660 while (tre_stack_num_objects(stack) > bottom)
1662 if (status != REG_OK)
1665 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1669 case ADDTAGS_SET_SUBMATCH_END:
1671 int id = tre_stack_pop_int(stack);
1674 /* Add end of this submatch to regset. */
1675 for (i = 0; regset[i] >= 0; i++);
1676 regset[i] = id * 2 + 1;
1679 /* Pop this submatch from the parents stack. */
1680 for (i = 0; parents[i] >= 0; i++);
1681 parents[i - 1] = -1;
1685 case ADDTAGS_RECURSE:
1686 node = tre_stack_pop_voidptr(stack);
1688 if (node->submatch_id >= 0)
1690 int id = node->submatch_id;
1694 /* Add start of this submatch to regset. */
1695 for (i = 0; regset[i] >= 0; i++);
1701 for (i = 0; parents[i] >= 0; i++);
1702 tnfa->submatch_data[id].parents = NULL;
1705 int *p = xmalloc(sizeof(*p) * (i + 1));
1708 status = REG_ESPACE;
1711 assert(tnfa->submatch_data[id].parents == NULL);
1712 tnfa->submatch_data[id].parents = p;
1713 for (i = 0; parents[i] >= 0; i++)
1719 /* Add end of this submatch to regset after processing this
1721 STACK_PUSHX(stack, int, node->submatch_id);
1722 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1729 tre_literal_t *lit = node->obj;
1731 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1736 /* Regset is not empty, so add a tag before the
1737 literal or backref. */
1740 status = tre_add_tag_left(mem, node, tag);
1741 tnfa->tag_directions[tag] = direction;
1742 if (minimal_tag >= 0)
1744 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1745 tnfa->minimal_tags[i] = tag;
1746 tnfa->minimal_tags[i + 1] = minimal_tag;
1747 tnfa->minimal_tags[i + 2] = -1;
1751 tre_purge_regset(regset, tnfa, tag);
1766 assert(!IS_TAG(lit));
1772 tre_catenation_t *cat = node->obj;
1773 tre_ast_node_t *left = cat->left;
1774 tre_ast_node_t *right = cat->right;
1775 int reserved_tag = -1;
1778 /* After processing right child. */
1779 STACK_PUSHX(stack, voidptr, node);
1780 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1782 /* Process right child. */
1783 STACK_PUSHX(stack, voidptr, right);
1784 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1786 /* After processing left child. */
1787 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1788 if (left->num_tags > 0 && right->num_tags > 0)
1790 /* Reserve the next tag to the right child. */
1791 reserved_tag = next_tag;
1794 STACK_PUSHX(stack, int, reserved_tag);
1795 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1797 /* Process left child. */
1798 STACK_PUSHX(stack, voidptr, left);
1799 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1805 tre_iteration_t *iter = node->obj;
1809 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1813 STACK_PUSHX(stack, int, tag);
1814 STACK_PUSHX(stack, int, iter->minimal);
1816 STACK_PUSHX(stack, voidptr, node);
1817 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1819 STACK_PUSHX(stack, voidptr, iter->arg);
1820 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1822 /* Regset is not empty, so add a tag here. */
1823 if (regset[0] >= 0 || iter->minimal)
1828 status = tre_add_tag_left(mem, node, tag);
1830 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1832 tnfa->tag_directions[tag] = direction;
1833 if (minimal_tag >= 0)
1835 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1836 tnfa->minimal_tags[i] = tag;
1837 tnfa->minimal_tags[i + 1] = minimal_tag;
1838 tnfa->minimal_tags[i + 2] = -1;
1842 tre_purge_regset(regset, tnfa, tag);
1850 direction = TRE_TAG_MINIMIZE;
1855 tre_union_t *uni = node->obj;
1856 tre_ast_node_t *left = uni->left;
1857 tre_ast_node_t *right = uni->right;
1863 left_tag = next_tag;
1864 right_tag = next_tag + 1;
1869 right_tag = next_tag;
1872 /* After processing right child. */
1873 STACK_PUSHX(stack, int, right_tag);
1874 STACK_PUSHX(stack, int, left_tag);
1875 STACK_PUSHX(stack, voidptr, regset);
1876 STACK_PUSHX(stack, int, regset[0] >= 0);
1877 STACK_PUSHX(stack, voidptr, node);
1878 STACK_PUSHX(stack, voidptr, right);
1879 STACK_PUSHX(stack, voidptr, left);
1880 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1882 /* Process right child. */
1883 STACK_PUSHX(stack, voidptr, right);
1884 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1886 /* After processing left child. */
1887 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1889 /* Process left child. */
1890 STACK_PUSHX(stack, voidptr, left);
1891 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1893 /* Regset is not empty, so add a tag here. */
1899 status = tre_add_tag_left(mem, node, tag);
1900 tnfa->tag_directions[tag] = direction;
1901 if (minimal_tag >= 0)
1903 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1904 tnfa->minimal_tags[i] = tag;
1905 tnfa->minimal_tags[i + 1] = minimal_tag;
1906 tnfa->minimal_tags[i + 2] = -1;
1910 tre_purge_regset(regset, tnfa, tag);
1919 if (node->num_submatches > 0)
1921 /* The next two tags are reserved for markers. */
1931 if (node->submatch_id >= 0)
1934 /* Push this submatch on the parents stack. */
1935 for (i = 0; parents[i] >= 0; i++);
1936 parents[i] = node->submatch_id;
1937 parents[i + 1] = -1;
1940 break; /* end case: ADDTAGS_RECURSE */
1942 case ADDTAGS_AFTER_ITERATION:
1946 node = tre_stack_pop_voidptr(stack);
1949 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1950 + tre_stack_pop_int(stack);
1955 minimal = tre_stack_pop_int(stack);
1956 enter_tag = tre_stack_pop_int(stack);
1958 minimal_tag = enter_tag;
1964 direction = TRE_TAG_MINIMIZE;
1966 direction = TRE_TAG_MAXIMIZE;
1971 case ADDTAGS_AFTER_CAT_LEFT:
1973 int new_tag = tre_stack_pop_int(stack);
1974 next_tag = tre_stack_pop_int(stack);
1982 case ADDTAGS_AFTER_CAT_RIGHT:
1983 node = tre_stack_pop_voidptr(stack);
1985 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1986 + ((tre_catenation_t *)node->obj)->right->num_tags;
1989 case ADDTAGS_AFTER_UNION_LEFT:
1990 /* Lift the bottom of the `regset' array so that when processing
1991 the right operand the items currently in the array are
1992 invisible. The original bottom was saved at ADDTAGS_UNION and
1993 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1994 while (*regset >= 0)
1998 case ADDTAGS_AFTER_UNION_RIGHT:
2000 int added_tags, tag_left, tag_right;
2001 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2002 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2003 node = tre_stack_pop_voidptr(stack);
2004 added_tags = tre_stack_pop_int(stack);
2007 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2008 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2009 + ((node->num_submatches > 0) ? 2 : 0);
2011 regset = tre_stack_pop_voidptr(stack);
2012 tag_left = tre_stack_pop_int(stack);
2013 tag_right = tre_stack_pop_int(stack);
2015 /* Add tags after both children, the left child gets a smaller
2016 tag than the right child. This guarantees that we prefer
2017 the left child over the right child. */
2018 /* XXX - This is not always necessary (if the children have
2019 tags which must be seen for every match of that child). */
2020 /* XXX - Check if this is the only place where tre_add_tag_right
2021 is used. If so, use tre_add_tag_left (putting the tag before
2022 the child as opposed after the child) and throw away
2023 tre_add_tag_right. */
2024 if (node->num_submatches > 0)
2028 status = tre_add_tag_right(mem, left, tag_left);
2029 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2030 status = tre_add_tag_right(mem, right, tag_right);
2031 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2035 direction = TRE_TAG_MAXIMIZE;
2043 } /* end switch(symbol) */
2044 } /* end while(tre_stack_num_objects(stack) > bottom) */
2047 tre_purge_regset(regset, tnfa, tag);
2049 if (!first_pass && minimal_tag >= 0)
2052 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2053 tnfa->minimal_tags[i] = tag;
2054 tnfa->minimal_tags[i + 1] = minimal_tag;
2055 tnfa->minimal_tags[i + 2] = -1;
2060 assert(tree->num_tags == num_tags);
2061 tnfa->end_tag = num_tags;
2062 tnfa->num_tags = num_tags;
2063 tnfa->num_minimals = num_minimals;
2066 xfree(saved_states);
2073 AST to TNFA compilation routines.
2079 } tre_copyast_symbol_t;
2081 /* Flags for tre_copy_ast(). */
2082 #define COPY_REMOVE_TAGS 1
2083 #define COPY_MAXIMIZE_FIRST_TAG 2
2085 static reg_errcode_t
2086 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2087 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2088 tre_ast_node_t **copy, int *max_pos)
2090 reg_errcode_t status = REG_OK;
2091 int bottom = tre_stack_num_objects(stack);
2094 tre_ast_node_t **result = copy;
2095 tre_copyast_symbol_t symbol;
2097 STACK_PUSH(stack, voidptr, ast);
2098 STACK_PUSH(stack, int, COPY_RECURSE);
2100 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2102 tre_ast_node_t *node;
2103 if (status != REG_OK)
2106 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2109 case COPY_SET_RESULT_PTR:
2110 result = tre_stack_pop_voidptr(stack);
2113 node = tre_stack_pop_voidptr(stack);
2118 tre_literal_t *lit = node->obj;
2119 int pos = lit->position;
2120 int min = lit->code_min;
2121 int max = lit->code_max;
2122 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2124 /* XXX - e.g. [ab] has only one position but two
2125 nodes, so we are creating holes in the state space
2126 here. Not fatal, just wastes memory. */
2130 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2132 /* Change this tag to empty. */
2136 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2139 /* Maximize the first tag. */
2140 tag_directions[max] = TRE_TAG_MAXIMIZE;
2143 *result = tre_ast_new_literal(mem, min, max, pos);
2144 if (*result == NULL)
2145 status = REG_ESPACE;
2153 tre_union_t *uni = node->obj;
2155 *result = tre_ast_new_union(mem, uni->left, uni->right);
2156 if (*result == NULL)
2158 status = REG_ESPACE;
2161 tmp = (*result)->obj;
2162 result = &tmp->left;
2163 STACK_PUSHX(stack, voidptr, uni->right);
2164 STACK_PUSHX(stack, int, COPY_RECURSE);
2165 STACK_PUSHX(stack, voidptr, &tmp->right);
2166 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2167 STACK_PUSHX(stack, voidptr, uni->left);
2168 STACK_PUSHX(stack, int, COPY_RECURSE);
2173 tre_catenation_t *cat = node->obj;
2174 tre_catenation_t *tmp;
2175 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2176 if (*result == NULL)
2178 status = REG_ESPACE;
2181 tmp = (*result)->obj;
2184 result = &tmp->left;
2186 STACK_PUSHX(stack, voidptr, cat->right);
2187 STACK_PUSHX(stack, int, COPY_RECURSE);
2188 STACK_PUSHX(stack, voidptr, &tmp->right);
2189 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2190 STACK_PUSHX(stack, voidptr, cat->left);
2191 STACK_PUSHX(stack, int, COPY_RECURSE);
2196 tre_iteration_t *iter = node->obj;
2197 STACK_PUSHX(stack, voidptr, iter->arg);
2198 STACK_PUSHX(stack, int, COPY_RECURSE);
2199 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2200 iter->max, iter->minimal);
2201 if (*result == NULL)
2203 status = REG_ESPACE;
2206 iter = (*result)->obj;
2207 result = &iter->arg;
2217 *pos_add += num_copied;
2224 } tre_expand_ast_symbol_t;
2226 /* Expands each iteration node that has a finite nonzero minimum or maximum
2227 iteration count to a catenated sequence of copies of the node. */
2228 static reg_errcode_t
2229 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2230 int *position, tre_tag_direction_t *tag_directions)
2232 reg_errcode_t status = REG_OK;
2233 int bottom = tre_stack_num_objects(stack);
2235 int pos_add_total = 0;
2239 STACK_PUSHR(stack, voidptr, ast);
2240 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2241 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2243 tre_ast_node_t *node;
2244 tre_expand_ast_symbol_t symbol;
2246 if (status != REG_OK)
2249 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2250 node = tre_stack_pop_voidptr(stack);
2253 case EXPAND_RECURSE:
2258 tre_literal_t *lit= node->obj;
2259 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2261 lit->position += pos_add;
2262 if (lit->position > max_pos)
2263 max_pos = lit->position;
2269 tre_union_t *uni = node->obj;
2270 STACK_PUSHX(stack, voidptr, uni->right);
2271 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2272 STACK_PUSHX(stack, voidptr, uni->left);
2273 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2278 tre_catenation_t *cat = node->obj;
2279 STACK_PUSHX(stack, voidptr, cat->right);
2280 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2281 STACK_PUSHX(stack, voidptr, cat->left);
2282 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2287 tre_iteration_t *iter = node->obj;
2288 STACK_PUSHX(stack, int, pos_add);
2289 STACK_PUSHX(stack, voidptr, node);
2290 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2291 STACK_PUSHX(stack, voidptr, iter->arg);
2292 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2293 /* If we are going to expand this node at EXPAND_AFTER_ITER
2294 then don't increase the `pos' fields of the nodes now, it
2295 will get done when expanding. */
2296 if (iter->min > 1 || iter->max > 1)
2306 case EXPAND_AFTER_ITER:
2308 tre_iteration_t *iter = node->obj;
2310 pos_add = tre_stack_pop_int(stack);
2311 pos_add_last = pos_add;
2312 if (iter->min > 1 || iter->max > 1)
2314 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2316 int pos_add_save = pos_add;
2318 /* Create a catenated sequence of copies of the node. */
2319 for (j = 0; j < iter->min; j++)
2321 tre_ast_node_t *copy;
2322 /* Remove tags from all but the last copy. */
2323 int flags = ((j + 1 < iter->min)
2325 : COPY_MAXIMIZE_FIRST_TAG);
2326 pos_add_save = pos_add;
2327 status = tre_copy_ast(mem, stack, iter->arg, flags,
2328 &pos_add, tag_directions, ©,
2330 if (status != REG_OK)
2333 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2340 if (iter->max == -1)
2342 /* No upper limit. */
2343 pos_add_save = pos_add;
2344 status = tre_copy_ast(mem, stack, iter->arg, 0,
2345 &pos_add, NULL, &seq2, &max_pos);
2346 if (status != REG_OK)
2348 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2354 for (j = iter->min; j < iter->max; j++)
2356 tre_ast_node_t *tmp, *copy;
2357 pos_add_save = pos_add;
2358 status = tre_copy_ast(mem, stack, iter->arg, 0,
2359 &pos_add, NULL, ©, &max_pos);
2360 if (status != REG_OK)
2363 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2368 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2371 seq2 = tre_ast_new_union(mem, tmp, seq2);
2377 pos_add = pos_add_save;
2380 else if (seq2 != NULL)
2381 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2384 node->obj = seq1->obj;
2385 node->type = seq1->type;
2389 pos_add_total += pos_add - pos_add_last;
2390 if (iter_depth == 0)
2391 pos_add = pos_add_total;
2401 *position += pos_add_total;
2403 /* `max_pos' should never be larger than `*position' if the above
2404 code works, but just an extra safeguard let's make sure
2405 `*position' is set large enough so enough memory will be
2406 allocated for the transition table. */
2407 if (max_pos > *position)
2408 *position = max_pos;
2413 static tre_pos_and_tags_t *
2414 tre_set_empty(tre_mem_t mem)
2416 tre_pos_and_tags_t *new_set;
2418 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2419 if (new_set == NULL)
2422 new_set[0].position = -1;
2423 new_set[0].code_min = -1;
2424 new_set[0].code_max = -1;
2429 static tre_pos_and_tags_t *
2430 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2431 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2433 tre_pos_and_tags_t *new_set;
2435 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2436 if (new_set == NULL)
2439 new_set[0].position = position;
2440 new_set[0].code_min = code_min;
2441 new_set[0].code_max = code_max;
2442 new_set[0].class = class;
2443 new_set[0].neg_classes = neg_classes;
2444 new_set[0].backref = backref;
2445 new_set[1].position = -1;
2446 new_set[1].code_min = -1;
2447 new_set[1].code_max = -1;
2452 static tre_pos_and_tags_t *
2453 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2454 int *tags, int assertions)
2457 tre_pos_and_tags_t *new_set;
2461 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2462 for (s1 = 0; set1[s1].position >= 0; s1++);
2463 for (s2 = 0; set2[s2].position >= 0; s2++);
2464 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2468 for (s1 = 0; set1[s1].position >= 0; s1++)
2470 new_set[s1].position = set1[s1].position;
2471 new_set[s1].code_min = set1[s1].code_min;
2472 new_set[s1].code_max = set1[s1].code_max;
2473 new_set[s1].assertions = set1[s1].assertions | assertions;
2474 new_set[s1].class = set1[s1].class;
2475 new_set[s1].neg_classes = set1[s1].neg_classes;
2476 new_set[s1].backref = set1[s1].backref;
2477 if (set1[s1].tags == NULL && tags == NULL)
2478 new_set[s1].tags = NULL;
2481 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2482 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2483 * (i + num_tags + 1)));
2484 if (new_tags == NULL)
2486 for (j = 0; j < i; j++)
2487 new_tags[j] = set1[s1].tags[j];
2488 for (i = 0; i < num_tags; i++)
2489 new_tags[j + i] = tags[i];
2490 new_tags[j + i] = -1;
2491 new_set[s1].tags = new_tags;
2495 for (s2 = 0; set2[s2].position >= 0; s2++)
2497 new_set[s1 + s2].position = set2[s2].position;
2498 new_set[s1 + s2].code_min = set2[s2].code_min;
2499 new_set[s1 + s2].code_max = set2[s2].code_max;
2500 /* XXX - why not | assertions here as well? */
2501 new_set[s1 + s2].assertions = set2[s2].assertions;
2502 new_set[s1 + s2].class = set2[s2].class;
2503 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2504 new_set[s1 + s2].backref = set2[s2].backref;
2505 if (set2[s2].tags == NULL)
2506 new_set[s1 + s2].tags = NULL;
2509 for (i = 0; set2[s2].tags[i] >= 0; i++);
2510 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2511 if (new_tags == NULL)
2513 for (j = 0; j < i; j++)
2514 new_tags[j] = set2[s2].tags[j];
2516 new_set[s1 + s2].tags = new_tags;
2519 new_set[s1 + s2].position = -1;
2523 /* Finds the empty path through `node' which is the one that should be
2524 taken according to POSIX.2 rules, and adds the tags on that path to
2525 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2526 set to the number of tags seen on the path. */
2527 static reg_errcode_t
2528 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2529 int *assertions, int *num_tags_seen)
2533 tre_catenation_t *cat;
2534 tre_iteration_t *iter;
2536 int bottom = tre_stack_num_objects(stack);
2537 reg_errcode_t status = REG_OK;
2541 status = tre_stack_push_voidptr(stack, node);
2543 /* Walk through the tree recursively. */
2544 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2546 node = tre_stack_pop_voidptr(stack);
2551 lit = (tre_literal_t *)node->obj;
2552 switch (lit->code_min)
2555 if (lit->code_max >= 0)
2559 /* Add the tag to `tags'. */
2560 for (i = 0; tags[i] >= 0; i++)
2561 if (tags[i] == lit->code_max)
2565 tags[i] = lit->code_max;
2574 assert(lit->code_max >= 1
2575 || lit->code_max <= ASSERT_LAST);
2576 if (assertions != NULL)
2577 *assertions |= lit->code_max;
2588 /* Subexpressions starting earlier take priority over ones
2589 starting later, so we prefer the left subexpression over the
2590 right subexpression. */
2591 uni = (tre_union_t *)node->obj;
2592 if (uni->left->nullable)
2593 STACK_PUSHX(stack, voidptr, uni->left)
2594 else if (uni->right->nullable)
2595 STACK_PUSHX(stack, voidptr, uni->right)
2601 /* The path must go through both children. */
2602 cat = (tre_catenation_t *)node->obj;
2603 assert(cat->left->nullable);
2604 assert(cat->right->nullable);
2605 STACK_PUSHX(stack, voidptr, cat->left);
2606 STACK_PUSHX(stack, voidptr, cat->right);
2610 /* A match with an empty string is preferred over no match at
2611 all, so we go through the argument if possible. */
2612 iter = (tre_iteration_t *)node->obj;
2613 if (iter->arg->nullable)
2614 STACK_PUSHX(stack, voidptr, iter->arg);
2630 NFL_POST_CATENATION,
2632 } tre_nfl_stack_symbol_t;
2635 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2636 the nodes of the AST `tree'. */
2637 static reg_errcode_t
2638 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2640 int bottom = tre_stack_num_objects(stack);
2642 STACK_PUSHR(stack, voidptr, tree);
2643 STACK_PUSHR(stack, int, NFL_RECURSE);
2645 while (tre_stack_num_objects(stack) > bottom)
2647 tre_nfl_stack_symbol_t symbol;
2648 tre_ast_node_t *node;
2650 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2651 node = tre_stack_pop_voidptr(stack);
2659 tre_literal_t *lit = (tre_literal_t *)node->obj;
2660 if (IS_BACKREF(lit))
2662 /* Back references: nullable = false, firstpos = {i},
2665 node->firstpos = tre_set_one(mem, lit->position, 0,
2666 TRE_CHAR_MAX, 0, NULL, -1);
2667 if (!node->firstpos)
2669 node->lastpos = tre_set_one(mem, lit->position, 0,
2670 TRE_CHAR_MAX, 0, NULL,
2671 (int)lit->code_max);
2675 else if (lit->code_min < 0)
2677 /* Tags, empty strings, params, and zero width assertions:
2678 nullable = true, firstpos = {}, and lastpos = {}. */
2680 node->firstpos = tre_set_empty(mem);
2681 if (!node->firstpos)
2683 node->lastpos = tre_set_empty(mem);
2689 /* Literal at position i: nullable = false, firstpos = {i},
2693 tre_set_one(mem, lit->position, (int)lit->code_min,
2694 (int)lit->code_max, 0, NULL, -1);
2695 if (!node->firstpos)
2697 node->lastpos = tre_set_one(mem, lit->position,
2700 lit->class, lit->neg_classes,
2709 /* Compute the attributes for the two subtrees, and after that
2711 STACK_PUSHR(stack, voidptr, node);
2712 STACK_PUSHR(stack, int, NFL_POST_UNION);
2713 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2714 STACK_PUSHR(stack, int, NFL_RECURSE);
2715 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2716 STACK_PUSHR(stack, int, NFL_RECURSE);
2720 /* Compute the attributes for the two subtrees, and after that
2722 STACK_PUSHR(stack, voidptr, node);
2723 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2724 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2725 STACK_PUSHR(stack, int, NFL_RECURSE);
2726 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2727 STACK_PUSHR(stack, int, NFL_RECURSE);
2731 /* Compute the attributes for the subtree, and after that for
2733 STACK_PUSHR(stack, voidptr, node);
2734 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2735 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2736 STACK_PUSHR(stack, int, NFL_RECURSE);
2739 break; /* end case: NFL_RECURSE */
2741 case NFL_POST_UNION:
2743 tre_union_t *uni = (tre_union_t *)node->obj;
2744 node->nullable = uni->left->nullable || uni->right->nullable;
2745 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2746 uni->right->firstpos, NULL, 0);
2747 if (!node->firstpos)
2749 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2750 uni->right->lastpos, NULL, 0);
2756 case NFL_POST_ITERATION:
2758 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2760 if (iter->min == 0 || iter->arg->nullable)
2764 node->firstpos = iter->arg->firstpos;
2765 node->lastpos = iter->arg->lastpos;
2769 case NFL_POST_CATENATION:
2771 int num_tags, *tags, assertions;
2772 reg_errcode_t status;
2773 tre_catenation_t *cat = node->obj;
2774 node->nullable = cat->left->nullable && cat->right->nullable;
2776 /* Compute firstpos. */
2777 if (cat->left->nullable)
2779 /* The left side matches the empty string. Make a first pass
2780 with tre_match_empty() to get the number of tags and
2782 status = tre_match_empty(stack, cat->left,
2783 NULL, NULL, &num_tags);
2784 if (status != REG_OK)
2786 /* Allocate arrays for the tags and parameters. */
2787 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2792 /* Second pass with tre_mach_empty() to get the list of
2793 tags and parameters. */
2794 status = tre_match_empty(stack, cat->left, tags,
2796 if (status != REG_OK)
2802 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2805 if (!node->firstpos)
2810 node->firstpos = cat->left->firstpos;
2813 /* Compute lastpos. */
2814 if (cat->right->nullable)
2816 /* The right side matches the empty string. Make a first pass
2817 with tre_match_empty() to get the number of tags and
2819 status = tre_match_empty(stack, cat->right,
2820 NULL, NULL, &num_tags);
2821 if (status != REG_OK)
2823 /* Allocate arrays for the tags and parameters. */
2824 tags = xmalloc(sizeof(int) * (num_tags + 1));
2829 /* Second pass with tre_mach_empty() to get the list of
2830 tags and parameters. */
2831 status = tre_match_empty(stack, cat->right, tags,
2833 if (status != REG_OK)
2839 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2847 node->lastpos = cat->right->lastpos;
2862 /* Adds a transition from each position in `p1' to each position in `p2'. */
2863 static reg_errcode_t
2864 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2865 tre_tnfa_transition_t *transitions,
2866 int *counts, int *offs)
2868 tre_pos_and_tags_t *orig_p2 = p2;
2869 tre_tnfa_transition_t *trans;
2870 int i, j, k, l, dup, prev_p2_pos;
2872 if (transitions != NULL)
2873 while (p1->position >= 0)
2877 while (p2->position >= 0)
2879 /* Optimization: if this position was already handled, skip it. */
2880 if (p2->position == prev_p2_pos)
2885 prev_p2_pos = p2->position;
2886 /* Set `trans' to point to the next unused transition from
2887 position `p1->position'. */
2888 trans = transitions + offs[p1->position];
2889 while (trans->state != NULL)
2892 /* If we find a previous transition from `p1->position' to
2893 `p2->position', it is overwritten. This can happen only
2894 if there are nested loops in the regexp, like in "((a)*)*".
2895 In POSIX.2 repetition using the outer loop is always
2896 preferred over using the inner loop. Therefore the
2897 transition for the inner loop is useless and can be thrown
2899 /* XXX - The same position is used for all nodes in a bracket
2900 expression, so this optimization cannot be used (it will
2901 break bracket expressions) unless I figure out a way to
2903 if (trans->state_id == p2->position)
2911 if (trans->state == NULL)
2912 (trans + 1)->state = NULL;
2913 /* Use the character ranges, assertions, etc. from `p1' for
2914 the transition from `p1' to `p2'. */
2915 trans->code_min = p1->code_min;
2916 trans->code_max = p1->code_max;
2917 trans->state = transitions + offs[p2->position];
2918 trans->state_id = p2->position;
2919 trans->assertions = p1->assertions | p2->assertions
2920 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2921 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2922 if (p1->backref >= 0)
2924 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2925 assert(p2->backref < 0);
2926 trans->u.backref = p1->backref;
2927 trans->assertions |= ASSERT_BACKREF;
2930 trans->u.class = p1->class;
2931 if (p1->neg_classes != NULL)
2933 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2934 trans->neg_classes =
2935 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2936 if (trans->neg_classes == NULL)
2938 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2939 trans->neg_classes[i] = p1->neg_classes[i];
2940 trans->neg_classes[i] = (tre_ctype_t)0;
2943 trans->neg_classes = NULL;
2945 /* Find out how many tags this transition has. */
2947 if (p1->tags != NULL)
2948 while(p1->tags[i] >= 0)
2951 if (p2->tags != NULL)
2952 while(p2->tags[j] >= 0)
2955 /* If we are overwriting a transition, free the old tag array. */
2956 if (trans->tags != NULL)
2960 /* If there were any tags, allocate an array and fill it. */
2963 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2967 if (p1->tags != NULL)
2968 while(p1->tags[i] >= 0)
2970 trans->tags[i] = p1->tags[i];
2975 if (p2->tags != NULL)
2976 while (p2->tags[j] >= 0)
2978 /* Don't add duplicates. */
2980 for (k = 0; k < i; k++)
2981 if (trans->tags[k] == p2->tags[j])
2987 trans->tags[l++] = p2->tags[j];
2990 trans->tags[l] = -1;
2998 /* Compute a maximum limit for the number of transitions leaving
3000 while (p1->position >= 0)
3003 while (p2->position >= 0)
3005 counts[p1->position]++;
3013 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3014 labelled with one character range (there are no transitions on empty
3015 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3017 static reg_errcode_t
3018 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3019 int *counts, int *offs)
3022 tre_catenation_t *cat;
3023 tre_iteration_t *iter;
3024 reg_errcode_t errcode = REG_OK;
3026 /* XXX - recurse using a stack!. */
3032 uni = (tre_union_t *)node->obj;
3033 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3034 if (errcode != REG_OK)
3036 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3040 cat = (tre_catenation_t *)node->obj;
3041 /* Add a transition from each position in cat->left->lastpos
3042 to each position in cat->right->firstpos. */
3043 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3044 transitions, counts, offs);
3045 if (errcode != REG_OK)
3047 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3048 if (errcode != REG_OK)
3050 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3054 iter = (tre_iteration_t *)node->obj;
3055 assert(iter->max == -1 || iter->max == 1);
3057 if (iter->max == -1)
3059 assert(iter->min == 0 || iter->min == 1);
3060 /* Add a transition from each last position in the iterated
3061 expression to each first position. */
3062 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3063 transitions, counts, offs);
3064 if (errcode != REG_OK)
3067 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3074 #define ERROR_EXIT(err) \
3078 if (/*CONSTCOND*/1) \
3081 while (/*CONSTCOND*/0)
3085 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
3088 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3089 tre_pos_and_tags_t *p;
3090 int *counts = NULL, *offs = NULL;
3092 tre_tnfa_transition_t *transitions, *initial;
3093 tre_tnfa_t *tnfa = NULL;
3094 tre_submatch_data_t *submatch_data;
3095 tre_tag_direction_t *tag_directions = NULL;
3096 reg_errcode_t errcode;
3099 /* Parse context. */
3100 tre_parse_ctx_t parse_ctx;
3102 /* Allocate a stack used throughout the compilation process for various
3104 stack = tre_stack_new(512, 10240, 128);
3107 /* Allocate a fast memory allocator. */
3108 mem = tre_mem_new();
3111 tre_stack_destroy(stack);
3115 /* Parse the regexp. */
3116 memset(&parse_ctx, 0, sizeof(parse_ctx));
3117 parse_ctx.mem = mem;
3118 parse_ctx.stack = stack;
3119 parse_ctx.re = regex;
3120 parse_ctx.cflags = cflags;
3121 parse_ctx.max_backref = -1;
3122 errcode = tre_parse(&parse_ctx);
3123 if (errcode != REG_OK)
3124 ERROR_EXIT(errcode);
3125 preg->re_nsub = parse_ctx.submatch_id - 1;
3126 tree = parse_ctx.result;
3128 /* Back references and approximate matching cannot currently be used
3129 in the same regexp. */
3130 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3131 ERROR_EXIT(REG_BADPAT);
3134 tre_ast_print(tree);
3135 #endif /* TRE_DEBUG */
3137 /* Referring to nonexistent subexpressions is illegal. */
3138 if (parse_ctx.max_backref > (int)preg->re_nsub)
3139 ERROR_EXIT(REG_ESUBREG);
3141 /* Allocate the TNFA struct. */
3142 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3144 ERROR_EXIT(REG_ESPACE);
3145 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3146 tnfa->have_approx = parse_ctx.have_approx;
3147 tnfa->num_submatches = parse_ctx.submatch_id;
3149 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3150 regexp does not have back references, this can be skipped. */
3151 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3154 /* Figure out how many tags we will need. */
3155 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3156 if (errcode != REG_OK)
3157 ERROR_EXIT(errcode);
3159 if (tnfa->num_tags > 0)
3161 tag_directions = xmalloc(sizeof(*tag_directions)
3162 * (tnfa->num_tags + 1));
3163 if (tag_directions == NULL)
3164 ERROR_EXIT(REG_ESPACE);
3165 tnfa->tag_directions = tag_directions;
3166 memset(tag_directions, -1,
3167 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3169 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3170 sizeof(tnfa->minimal_tags));
3171 if (tnfa->minimal_tags == NULL)
3172 ERROR_EXIT(REG_ESPACE);
3174 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3175 sizeof(*submatch_data));
3176 if (submatch_data == NULL)
3177 ERROR_EXIT(REG_ESPACE);
3178 tnfa->submatch_data = submatch_data;
3180 errcode = tre_add_tags(mem, stack, tree, tnfa);
3181 if (errcode != REG_OK)
3182 ERROR_EXIT(errcode);
3186 /* Expand iteration nodes. */
3187 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3189 if (errcode != REG_OK)
3190 ERROR_EXIT(errcode);
3192 /* Add a dummy node for the final state.
3193 XXX - For certain patterns this dummy node can be optimized away,
3194 for example "a*" or "ab*". Figure out a simple way to detect
3195 this possibility. */
3197 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3198 if (tmp_ast_r == NULL)
3199 ERROR_EXIT(REG_ESPACE);
3201 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3203 ERROR_EXIT(REG_ESPACE);
3205 errcode = tre_compute_nfl(mem, stack, tree);
3206 if (errcode != REG_OK)
3207 ERROR_EXIT(errcode);
3209 counts = xmalloc(sizeof(int) * parse_ctx.position);
3211 ERROR_EXIT(REG_ESPACE);
3213 offs = xmalloc(sizeof(int) * parse_ctx.position);
3215 ERROR_EXIT(REG_ESPACE);
3217 for (i = 0; i < parse_ctx.position; i++)
3219 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3222 for (i = 0; i < parse_ctx.position; i++)
3225 add += counts[i] + 1;
3228 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3229 if (transitions == NULL)
3230 ERROR_EXIT(REG_ESPACE);
3231 tnfa->transitions = transitions;
3232 tnfa->num_transitions = add;
3234 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3235 if (errcode != REG_OK)
3236 ERROR_EXIT(errcode);
3238 tnfa->firstpos_chars = NULL;
3242 while (p->position >= 0)
3248 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3249 if (initial == NULL)
3250 ERROR_EXIT(REG_ESPACE);
3251 tnfa->initial = initial;
3254 for (p = tree->firstpos; p->position >= 0; p++)
3256 initial[i].state = transitions + offs[p->position];
3257 initial[i].state_id = p->position;
3258 initial[i].tags = NULL;
3259 /* Copy the arrays p->tags, and p->params, they are allocated
3260 from a tre_mem object. */
3264 for (j = 0; p->tags[j] >= 0; j++);
3265 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3266 if (!initial[i].tags)
3267 ERROR_EXIT(REG_ESPACE);
3268 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3270 initial[i].assertions = p->assertions;
3273 initial[i].state = NULL;
3275 tnfa->num_transitions = add;
3276 tnfa->final = transitions + offs[tree->lastpos[0].position];
3277 tnfa->num_states = parse_ctx.position;
3278 tnfa->cflags = cflags;
3280 tre_mem_destroy(mem);
3281 tre_stack_destroy(stack);
3285 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3289 /* Free everything that was allocated and return the error code. */
3290 tre_mem_destroy(mem);
3292 tre_stack_destroy(stack);
3297 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3306 regfree(regex_t *preg)
3310 tre_tnfa_transition_t *trans;
3312 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3316 for (i = 0; i < tnfa->num_transitions; i++)
3317 if (tnfa->transitions[i].state)
3319 if (tnfa->transitions[i].tags)
3320 xfree(tnfa->transitions[i].tags);
3321 if (tnfa->transitions[i].neg_classes)
3322 xfree(tnfa->transitions[i].neg_classes);
3324 if (tnfa->transitions)
3325 xfree(tnfa->transitions);
3329 for (trans = tnfa->initial; trans->state; trans++)
3334 xfree(tnfa->initial);
3337 if (tnfa->submatch_data)
3339 for (i = 0; i < tnfa->num_submatches; i++)
3340 if (tnfa->submatch_data[i].parents)
3341 xfree(tnfa->submatch_data[i].parents);
3342 xfree(tnfa->submatch_data);
3345 if (tnfa->tag_directions)
3346 xfree(tnfa->tag_directions);
3347 if (tnfa->firstpos_chars)
3348 xfree(tnfa->firstpos_chars);
3349 if (tnfa->minimal_tags)
3350 xfree(tnfa->minimal_tags);