nsz Git - musl/blob - src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37 #include <ctype.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56 } tre_pos_and_tags_t;
  57
  58
  59 /***********************************************************************
  60  from tre-ast.c and tre-ast.h
  61 ***********************************************************************/
  62
  63 /* The different AST node types. */
  64 typedef enum {
  65   LITERAL,
  66   CATENATION,
  67   ITERATION,
  68   UNION
  69 } tre_ast_type_t;
  70
  71 /* Special subtypes of TRE_LITERAL. */
  72 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  73 #define ASSERTION -2   /* Assertion leaf. */
  74 #define TAG       -3   /* Tag leaf. */
  75 #define BACKREF   -4   /* Back reference leaf. */
  76
  77 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  78 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  80 #define IS_TAG(x)       ((x)->code_min == TAG)
  81 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  82
  83
  84 /* A generic AST node.  All AST nodes consist of this node on the top
  85    level with `obj' pointing to the actual content. */
  86 typedef struct {
  87   tre_ast_type_t type;   /* Type of the node. */
  88   void *obj;             /* Pointer to actual node. */
  89   int nullable;
  90   int submatch_id;
  91   int num_submatches;
  92   int num_tags;
  93   tre_pos_and_tags_t *firstpos;
  94   tre_pos_and_tags_t *lastpos;
  95 } tre_ast_node_t;
  96
  97
  98 /* A "literal" node.  These are created for assertions, back references,
  99    tags, matching parameter settings, and all expressions that match one
 100    character. */
 101 typedef struct {
 102   long code_min;
 103   long code_max;
 104   int position;
 105   tre_ctype_t class;
 106   tre_ctype_t *neg_classes;
 107 } tre_literal_t;
 108
 109 /* A "catenation" node.  These are created when two regexps are concatenated.
 110    If there are more than one subexpressions in sequence, the `left' part
 111    holds all but the last, and `right' part holds the last subexpression
 112    (catenation is left associative). */
 113 typedef struct {
 114   tre_ast_node_t *left;
 115   tre_ast_node_t *right;
 116 } tre_catenation_t;
 117
 118 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 119    operators. */
 120 typedef struct {
 121   /* Subexpression to match. */
 122   tre_ast_node_t *arg;
 123   /* Minimum number of consecutive matches. */
 124   int min;
 125   /* Maximum number of consecutive matches. */
 126   int max;
 127   /* If 0, match as many characters as possible, if 1 match as few as
 128      possible.  Note that this does not always mean the same thing as
 129      matching as many/few repetitions as possible. */
 130   unsigned int minimal:1;
 131 } tre_iteration_t;
 132
 133 /* An "union" node.  These are created for the "|" operator. */
 134 typedef struct {
 135   tre_ast_node_t *left;
 136   tre_ast_node_t *right;
 137 } tre_union_t;
 138
 139
 140 static tre_ast_node_t *
 141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 142 {
 143         tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
 144         if (!node || !obj)
 145                 return 0;
 146         node->obj = obj;
 147         node->type = type;
 148         node->nullable = -1;
 149         node->submatch_id = -1;
 150         return node;
 151 }
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 155 {
 156         tre_ast_node_t *node;
 157         tre_literal_t *lit;
 158
 159         lit = tre_mem_calloc(mem, sizeof *lit);
 160         node = tre_ast_new_node(mem, LITERAL, lit);
 161         if (!node)
 162                 return 0;
 163         lit->code_min = code_min;
 164         lit->code_max = code_max;
 165         lit->position = position;
 166         return node;
 167 }
 168
 169 static tre_ast_node_t *
 170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 171 {
 172         tre_ast_node_t *node;
 173         tre_iteration_t *iter;
 174
 175         iter = tre_mem_calloc(mem, sizeof *iter);
 176         node = tre_ast_new_node(mem, ITERATION, iter);
 177         if (!node)
 178                 return 0;
 179         iter->arg = arg;
 180         iter->min = min;
 181         iter->max = max;
 182         iter->minimal = minimal;
 183         node->num_submatches = arg->num_submatches;
 184         return node;
 185 }
 186
 187 static tre_ast_node_t *
 188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 189 {
 190         tre_ast_node_t *node;
 191         tre_union_t *un;
 192
 193         if (!left)
 194                 return right;
 195         un = tre_mem_calloc(mem, sizeof *un);
 196         node = tre_ast_new_node(mem, UNION, un);
 197         if (!node || !right)
 198                 return 0;
 199         un->left = left;
 200         un->right = right;
 201         node->num_submatches = left->num_submatches + right->num_submatches;
 202         return node;
 203 }
 204
 205 static tre_ast_node_t *
 206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 207 {
 208         tre_ast_node_t *node;
 209         tre_catenation_t *cat;
 210
 211         if (!left)
 212                 return right;
 213         cat = tre_mem_calloc(mem, sizeof *cat);
 214         node = tre_ast_new_node(mem, CATENATION, cat);
 215         if (!node)
 216                 return 0;
 217         cat->left = left;
 218         cat->right = right;
 219         node->num_submatches = left->num_submatches + right->num_submatches;
 220         return node;
 221 }
 222
 223
 224 /***********************************************************************
 225  from tre-stack.c and tre-stack.h
 226 ***********************************************************************/
 227
 228 typedef struct tre_stack_rec tre_stack_t;
 229
 230 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 231    is maximum size, and `increment' specifies how much more space will be
 232    allocated with realloc() if all space gets used up.  Returns the stack
 233    object or NULL if out of memory. */
 234 static tre_stack_t *
 235 tre_stack_new(int size, int max_size, int increment);
 236
 237 /* Frees the stack object. */
 238 static void
 239 tre_stack_destroy(tre_stack_t *s);
 240
 241 /* Returns the current number of objects in the stack. */
 242 static int
 243 tre_stack_num_objects(tre_stack_t *s);
 244
 245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 247    This tries to realloc() more space before failing if maximum size
 248    has not yet been reached.  Returns REG_OK if successful. */
 249 #define declare_pushf(typetag, type)                                          \
 250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 251
 252 declare_pushf(voidptr, void *);
 253 declare_pushf(int, int);
 254
 255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 256    element off of stack `s' and returns it.  The stack must not be
 257    empty. */
 258 #define declare_popf(typetag, type)               \
 259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 260
 261 declare_popf(voidptr, void *);
 262 declare_popf(int, int);
 263
 264 /* Just to save some typing. */
 265 #define STACK_PUSH(s, typetag, value)                                         \
 266   do                                                                          \
 267     {                                                                         \
 268       status = tre_stack_push_ ## typetag(s, value);                          \
 269     }                                                                         \
 270   while (/*CONSTCOND*/0)
 271
 272 #define STACK_PUSHX(s, typetag, value)                                        \
 273   {                                                                           \
 274     status = tre_stack_push_ ## typetag(s, value);                            \
 275     if (status != REG_OK)                                                     \
 276       break;                                                                  \
 277   }
 278
 279 #define STACK_PUSHR(s, typetag, value)                                        \
 280   {                                                                           \
 281     reg_errcode_t _status;                                                    \
 282     _status = tre_stack_push_ ## typetag(s, value);                           \
 283     if (_status != REG_OK)                                                    \
 284       return _status;                                                         \
 285   }
 286
 287 union tre_stack_item {
 288   void *voidptr_value;
 289   int int_value;
 290 };
 291
 292 struct tre_stack_rec {
 293   int size;
 294   int max_size;
 295   int increment;
 296   int ptr;
 297   union tre_stack_item *stack;
 298 };
 299
 300
 301 static tre_stack_t *
 302 tre_stack_new(int size, int max_size, int increment)
 303 {
 304   tre_stack_t *s;
 305
 306   s = xmalloc(sizeof(*s));
 307   if (s != NULL)
 308     {
 309       s->stack = xmalloc(sizeof(*s->stack) * size);
 310       if (s->stack == NULL)
 311         {
 312           xfree(s);
 313           return NULL;
 314         }
 315       s->size = size;
 316       s->max_size = max_size;
 317       s->increment = increment;
 318       s->ptr = 0;
 319     }
 320   return s;
 321 }
 322
 323 static void
 324 tre_stack_destroy(tre_stack_t *s)
 325 {
 326   xfree(s->stack);
 327   xfree(s);
 328 }
 329
 330 static int
 331 tre_stack_num_objects(tre_stack_t *s)
 332 {
 333   return s->ptr;
 334 }
 335
 336 static reg_errcode_t
 337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 338 {
 339   if (s->ptr < s->size)
 340     {
 341       s->stack[s->ptr] = value;
 342       s->ptr++;
 343     }
 344   else
 345     {
 346       if (s->size >= s->max_size)
 347         {
 348           return REG_ESPACE;
 349         }
 350       else
 351         {
 352           union tre_stack_item *new_buffer;
 353           int new_size;
 354           new_size = s->size + s->increment;
 355           if (new_size > s->max_size)
 356             new_size = s->max_size;
 357           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 358           if (new_buffer == NULL)
 359             {
 360               return REG_ESPACE;
 361             }
 362           assert(new_size > s->size);
 363           s->size = new_size;
 364           s->stack = new_buffer;
 365           tre_stack_push(s, value);
 366         }
 367     }
 368   return REG_OK;
 369 }
 370
 371 #define define_pushf(typetag, type)  \
 372   declare_pushf(typetag, type) {     \
 373     union tre_stack_item item;       \
 374     item.typetag ## _value = value;  \
 375     return tre_stack_push(s, item);  \
 376 }
 377
 378 define_pushf(int, int)
 379 define_pushf(voidptr, void *)
 380
 381 #define define_popf(typetag, type)                  \
 382   declare_popf(typetag, type) {                     \
 383     return s->stack[--s->ptr].typetag ## _value;    \
 384   }
 385
 386 define_popf(int, int)
 387 define_popf(voidptr, void *)
 388
 389
 390 /***********************************************************************
 391  from tre-parse.c and tre-parse.h
 392 ***********************************************************************/
 393
 394 /* Parse context. */
 395 typedef struct {
 396         /* Memory allocator. The AST is allocated using this. */
 397         tre_mem_t mem;
 398         /* Stack used for keeping track of regexp syntax. */
 399         tre_stack_t *stack;
 400         /* The parsed node after a parse function returns. */
 401         tre_ast_node_t *n;
 402         /* Position in the regexp pattern after a parse function returns. */
 403         const char *s;
 404         /* The first character of the regexp. */
 405         const char *re;
 406         /* Current submatch ID. */
 407         int submatch_id;
 408         /* Current position (number of literal). */
 409         int position;
 410         /* The highest back reference or -1 if none seen so far. */
 411         int max_backref;
 412         /* Compilation flags. */
 413         int cflags;
 414 } tre_parse_ctx_t;
 415
 416 /* Some macros for expanding \w, \s, etc. */
 417 static const struct {
 418         char c;
 419         const char *expansion;
 420 } tre_macros[] = {
 421         {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 422         {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 423         {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 424         {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
 425         { 0, 0 }
 426 };
 427
 428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 429    must have at least `len' items.  Sets buf[0] to zero if the there
 430    is no match in `tre_macros'. */
 431 static const char *tre_expand_macro(const char *s)
 432 {
 433         int i;
 434         for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
 435         return tre_macros[i].expansion;
 436 }
 437
 438 static int
 439 tre_compare_lit(const void *a, const void *b)
 440 {
 441         const tre_literal_t *const *la = a;
 442         const tre_literal_t *const *lb = b;
 443         /* assumes the range of valid code_min is < INT_MAX */
 444         return la[0]->code_min - lb[0]->code_min;
 445 }
 446
 447 struct literals {
 448         tre_mem_t mem;
 449         tre_literal_t **a;
 450         int len;
 451         int cap;
 452 };
 453
 454 static tre_literal_t *tre_new_lit(struct literals *p)
 455 {
 456         tre_literal_t **a;
 457         if (p->len >= p->cap) {
 458                 if (p->cap >= 1<<15)
 459                         return 0;
 460                 p->cap *= 2;
 461                 a = xrealloc(p->a, p->cap * sizeof *p->a);
 462                 if (!a)
 463                         return 0;
 464                 p->a = a;
 465         }
 466         a = p->a + p->len++;
 467         *a = tre_mem_calloc(p->mem, sizeof **a);
 468         return *a;
 469 }
 470
 471 static int add_icase_literals(struct literals *ls, int min, int max)
 472 {
 473         tre_literal_t *lit;
 474         int b, e, c;
 475         for (c=min; c<=max; ) {
 476                 /* assumes islower(c) and isupper(c) are exclusive
 477                    and toupper(c)!=c if islower(c).
 478                    multiple opposite case characters are not supported */
 479                 if (tre_islower(c)) {
 480                         b = e = tre_toupper(c);
 481                         for (c++, e++; c<=max; c++, e++)
 482                                 if (tre_toupper(c) != e) break;
 483                 } else if (tre_isupper(c)) {
 484                         b = e = tre_tolower(c);
 485                         for (c++, e++; c<=max; c++, e++)
 486                                 if (tre_tolower(c) != e) break;
 487                 } else {
 488                         c++;
 489                         continue;
 490                 }
 491                 lit = tre_new_lit(ls);
 492                 if (!lit)
 493                         return -1;
 494                 lit->code_min = b;
 495                 lit->code_max = e-1;
 496                 lit->position = -1;
 497         }
 498         return 0;
 499 }
 500
 501
 502 /* Maximum number of character classes in a negated bracket expression. */
 503 #define MAX_NEG_CLASSES 64
 504
 505 struct neg {
 506         int negate;
 507         int len;
 508         tre_ctype_t a[MAX_NEG_CLASSES];
 509 };
 510
 511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 512
 513 /*
 514 bracket grammar:
 515 Bracket  =  '[' List ']'  |  '[^' List ']'
 516 List     =  Term  |  List Term
 517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
 518 Range    =  Char '-' Char  |  Char '-' '-'
 519 Char     =  Coll  |  coll_single
 520 Meta     =  ']'  |  '-'
 521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
 522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
 523 Chclass  =  '[:' class ':]'
 524
 525 coll_single is a single char collating element but it can be
 526  '-' only at the beginning or end of a List and
 527  ']' only at the beginning of a List and
 528  '^' anywhere except after the openning '['
 529 */
 530
 531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
 532 {
 533         const char *start = s;
 534         tre_ctype_t class;
 535         int min, max;
 536         wchar_t wc;
 537         int len;
 538
 539         for (;;) {
 540                 class = 0;
 541                 len = mbtowc(&wc, s, -1);
 542                 if (len <= 0)
 543                         return *s ? REG_BADPAT : REG_EBRACK;
 544                 if (*s == ']' && s != start) {
 545                         ctx->s = s+1;
 546                         return REG_OK;
 547                 }
 548                 if (*s == '-' && s != start && s[1] != ']' &&
 549                     /* extension: [a-z--@] is accepted as [a-z]|[--@] */
 550                     (s[1] != '-' || s[2] == ']'))
 551                         return REG_ERANGE;
 552                 if (*s == '[' && (s[1] == '.' || s[1] == '='))
 553                         /* collating symbols and equivalence classes are not supported */
 554                         return REG_ECOLLATE;
 555                 if (*s == '[' && s[1] == ':') {
 556                         char tmp[CHARCLASS_NAME_MAX+1];
 557                         s += 2;
 558                         for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
 559                                 if (s[len] == ':') {
 560                                         memcpy(tmp, s, len);
 561                                         tmp[len] = 0;
 562                                         class = tre_ctype(tmp);
 563                                         break;
 564                                 }
 565                         }
 566                         if (!class || s[len+1] != ']')
 567                                 return REG_ECTYPE;
 568                         min = 0;
 569                         max = TRE_CHAR_MAX;
 570                         s += len+2;
 571                 } else {
 572                         min = max = wc;
 573                         s += len;
 574                         if (*s == '-' && s[1] != ']') {
 575                                 s++;
 576                                 len = mbtowc(&wc, s, -1);
 577                                 max = wc;
 578                                 /* XXX - Should use collation order instead of
 579                                    encoding values in character ranges. */
 580                                 if (len <= 0 || min > max)
 581                                         return REG_ERANGE;
 582                                 s += len;
 583                         }
 584                 }
 585
 586                 if (class && neg->negate) {
 587                         if (neg->len >= MAX_NEG_CLASSES)
 588                                 return REG_ESPACE;
 589                         neg->a[neg->len++] = class;
 590                 } else  {
 591                         tre_literal_t *lit = tre_new_lit(ls);
 592                         if (!lit)
 593                                 return REG_ESPACE;
 594                         lit->code_min = min;
 595                         lit->code_max = max;
 596                         lit->class = class;
 597                         lit->position = -1;
 598
 599                         /* Add opposite-case codepoints if REG_ICASE is present.
 600                            It seems that POSIX requires that bracket negation
 601                            should happen before case-folding, but most practical
 602                            implementations do it the other way around. Changing
 603                            the order would need efficient representation of
 604                            case-fold ranges and bracket range sets even with
 605                            simple patterns so this is ok for now. */
 606                         if (ctx->cflags & REG_ICASE && !class)
 607                                 if (add_icase_literals(ls, min, max))
 608                                         return REG_ESPACE;
 609                 }
 610         }
 611 }
 612
 613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 614 {
 615         int i, max, min, negmax, negmin;
 616         tre_ast_node_t *node = 0, *n;
 617         tre_ctype_t *nc = 0;
 618         tre_literal_t *lit;
 619         struct literals ls;
 620         struct neg neg;
 621         reg_errcode_t err;
 622
 623         ls.mem = ctx->mem;
 624         ls.len = 0;
 625         ls.cap = 32;
 626         ls.a = xmalloc(ls.cap * sizeof *ls.a);
 627         if (!ls.a)
 628                 return REG_ESPACE;
 629         neg.len = 0;
 630         neg.negate = *s == '^';
 631         if (neg.negate)
 632                 s++;
 633
 634         err = parse_bracket_terms(ctx, s, &ls, &neg);
 635         if (err != REG_OK)
 636                 goto parse_bracket_done;
 637
 638         if (neg.negate) {
 639                 /* Sort the array if we need to negate it. */
 640                 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
 641                 /* extra lit for the last negated range */
 642                 lit = tre_new_lit(&ls);
 643                 if (!lit) {
 644                         err = REG_ESPACE;
 645                         goto parse_bracket_done;
 646                 }
 647                 lit->code_min = TRE_CHAR_MAX+1;
 648                 lit->code_max = TRE_CHAR_MAX+1;
 649                 lit->position = -1;
 650                 /* negated classes */
 651                 if (neg.len) {
 652                         nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
 653                         if (!nc) {
 654                                 err = REG_ESPACE;
 655                                 goto parse_bracket_done;
 656                         }
 657                         memcpy(nc, neg.a, neg.len*sizeof *neg.a);
 658                         nc[neg.len] = 0;
 659                 }
 660         }
 661
 662         /* Build a union of the items in the array, negated if necessary. */
 663         negmax = negmin = 0;
 664         for (i = 0; i < ls.len; i++) {
 665                 lit = ls.a[i];
 666                 min = lit->code_min;
 667                 max = lit->code_max;
 668                 if (neg.negate) {
 669                         if (min <= negmin) {
 670                                 /* Overlap. */
 671                                 negmin = MAX(max + 1, negmin);
 672                                 continue;
 673                         }
 674                         negmax = min - 1;
 675                         lit->code_min = negmin;
 676                         lit->code_max = negmax;
 677                         negmin = max + 1;
 678                 }
 679                 lit->position = ctx->position;
 680                 lit->neg_classes = nc;
 681                 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
 682                 node = tre_ast_new_union(ctx->mem, node, n);
 683                 if (!node) {
 684                         err = REG_ESPACE;
 685                         break;
 686                 }
 687         }
 688
 689 parse_bracket_done:
 690         xfree(ls.a);
 691         ctx->position++;
 692         ctx->n = node;
 693         return err;
 694 }
 695
 696 static const char *parse_dup_count(const char *s, int *n)
 697 {
 698         *n = -1;
 699         if (!isdigit(*s))
 700                 return s;
 701         *n = 0;
 702         for (;;) {
 703                 *n = 10 * *n + (*s - '0');
 704                 s++;
 705                 if (!isdigit(*s) || *n > RE_DUP_MAX)
 706                         break;
 707         }
 708         return s;
 709 }
 710
 711 static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
 712 {
 713         int min, max;
 714
 715         s = parse_dup_count(s, &min);
 716         if (*s == ',')
 717                 s = parse_dup_count(s+1, &max);
 718         else
 719                 max = min;
 720
 721         if (
 722                 (max < min && max >= 0) ||
 723                 max > RE_DUP_MAX ||
 724                 min > RE_DUP_MAX ||
 725                 min < 0 ||
 726                 (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
 727                 *s++ != '}'
 728         )
 729                 return REG_BADBR;
 730
 731         if (min == 0 && max == 0)
 732                 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 733         else
 734                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
 735         if (!ctx->n)
 736                 return REG_ESPACE;
 737         ctx->s = s;
 738         return REG_OK;
 739 }
 740
 741 static int hexval(unsigned c)
 742 {
 743         if (c-'0'<10) return c-'0';
 744         c |= 32;
 745         if (c-'a'<6) return c-'a'+10;
 746         return -1;
 747 }
 748
 749 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
 750 {
 751         if (node->submatch_id >= 0) {
 752                 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 753                 if (!n)
 754                         return REG_ESPACE;
 755                 n = tre_ast_new_catenation(ctx->mem, n, node);
 756                 if (!n)
 757                         return REG_ESPACE;
 758                 n->num_submatches = node->num_submatches;
 759                 node = n;
 760         }
 761         node->submatch_id = subid;
 762         node->num_submatches++;
 763         ctx->n = node;
 764         return REG_OK;
 765 }
 766
 767 /*
 768 BRE grammar:
 769 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
 770 Branch =  Atom  |  Branch Atom
 771 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
 772 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 773
 774 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 775
 776 ERE grammar:
 777 Regex  =  Branch  |  Regex '|' Branch
 778 Branch =  Atom  |  Branch Atom
 779 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
 780 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 781
 782 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 783 */
 784
 785 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
 786 {
 787         int len, ere = ctx->cflags & REG_EXTENDED;
 788         const char *p;
 789         tre_ast_node_t *node;
 790         wchar_t wc;
 791         switch (*s) {
 792         case '[':
 793                 return parse_bracket(ctx, s+1);
 794         case '\\':
 795                 p = tre_expand_macro(s+1);
 796                 if (p) {
 797                         /* assume \X expansion is a single atom */
 798                         reg_errcode_t err = parse_atom(ctx, p);
 799                         ctx->s = s+2;
 800                         return err;
 801                 }
 802                 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
 803                 switch (*++s) {
 804                 case 0:
 805                         return REG_EESCAPE;
 806                 case 'b':
 807                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
 808                         break;
 809                 case 'B':
 810                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
 811                         break;
 812                 case '<':
 813                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
 814                         break;
 815                 case '>':
 816                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
 817                         break;
 818                 case 'x':
 819                         s++;
 820                         int i, v = 0, c;
 821                         len = 2;
 822                         if (*s == '{') {
 823                                 len = 8;
 824                                 s++;
 825                         }
 826                         for (i=0; i<len && v<0x110000; i++) {
 827                                 c = hexval(s[i]);
 828                                 if (c < 0) break;
 829                                 v = 16*v + c;
 830                         }
 831                         s += i;
 832                         if (len == 8) {
 833                                 if (*s != '}')
 834                                         return REG_EBRACE;
 835                                 s++;
 836                         }
 837                         node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
 838                         s--;
 839                         break;
 840                 case '{':
 841                         /* reject repetitions after empty expression in BRE */
 842                         if (!ere)
 843                                 return REG_BADRPT;
 844                 default:
 845                         if (!ere && (unsigned)*s-'1' < 9) {
 846                                 /* back reference */
 847                                 int val = *s - '0';
 848                                 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
 849                                 ctx->max_backref = MAX(val, ctx->max_backref);
 850                         } else {
 851                                 /* extension: accept unknown escaped char
 852                                    as a literal */
 853                                 goto parse_literal;
 854                         }
 855                 }
 856                 s++;
 857                 break;
 858         case '.':
 859                 if (ctx->cflags & REG_NEWLINE) {
 860                         tre_ast_node_t *tmp1, *tmp2;
 861                         tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
 862                         tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
 863                         if (tmp1 && tmp2)
 864                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 865                         else
 866                                 node = 0;
 867                 } else {
 868                         node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 869                 }
 870                 s++;
 871                 break;
 872         case '^':
 873                 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
 874                 if (!ere && s != ctx->re)
 875                         goto parse_literal;
 876                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
 877                 s++;
 878                 break;
 879         case '$':
 880                 /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
 881                 if (!ere && s[1])
 882                         goto parse_literal;
 883                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
 884                 s++;
 885                 break;
 886         case '*':
 887                 return REG_BADPAT;
 888         case '{':
 889         case '+':
 890         case '?':
 891                 /* reject repetitions after empty expression in ERE */
 892                 if (ere)
 893                         return REG_BADRPT;
 894         case '|':
 895                 if (!ere)
 896                         goto parse_literal;
 897         case 0:
 898                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 899                 break;
 900         default:
 901 parse_literal:
 902                 len = mbtowc(&wc, s, -1);
 903                 if (len < 0)
 904                         return REG_BADPAT;
 905                 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
 906                         tre_ast_node_t *tmp1, *tmp2;
 907                         /* multiple opposite case characters are not supported */
 908                         tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
 909                         tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
 910                         if (tmp1 && tmp2)
 911                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 912                         else
 913                                 node = 0;
 914                 } else {
 915                         node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 916                 }
 917                 ctx->position++;
 918                 s += len;
 919                 break;
 920         }
 921         if (!node)
 922                 return REG_ESPACE;
 923         ctx->n = node;
 924         ctx->s = s;
 925         return REG_OK;
 926 }
 927
 928 #define PUSHPTR(err, s, v) do { \
 929         if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
 930                 return err; \
 931 } while(0)
 932
 933 #define PUSHINT(err, s, v) do { \
 934         if ((err = tre_stack_push_int(s, v)) != REG_OK) \
 935                 return err; \
 936 } while(0)
 937
 938 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
 939 {
 940         tre_ast_node_t *nbranch=0, *nunion=0;
 941         int ere = ctx->cflags & REG_EXTENDED;
 942         const char *s = ctx->re;
 943         int subid = 0;
 944         int depth = 0;
 945         reg_errcode_t err;
 946         tre_stack_t *stack = ctx->stack;
 947
 948         PUSHINT(err, stack, subid++);
 949         for (;;) {
 950                 if ((!ere && *s == '\\' && s[1] == '(') ||
 951                     (ere && *s == '(')) {
 952                         PUSHPTR(err, stack, nunion);
 953                         PUSHPTR(err, stack, nbranch);
 954                         PUSHINT(err, stack, subid++);
 955                         s++;
 956                         if (!ere)
 957                                 s++;
 958                         depth++;
 959                         nbranch = nunion = 0;
 960                         continue;
 961                 }
 962                 if ((!ere && *s == '\\' && s[1] == ')') ||
 963                     (ere && *s == ')' && depth)) {
 964                         ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 965                         if (!ctx->n)
 966                                 return REG_ESPACE;
 967                 } else {
 968                         err = parse_atom(ctx, s);
 969                         if (err != REG_OK)
 970                                 return err;
 971                         s = ctx->s;
 972                 }
 973
 974         parse_iter:
 975                 /* extension: repetitions are rejected after an empty node
 976                    eg. (+), |*, {2}, but assertions are not treated as empty
 977                    so ^* or $? are accepted currently. */
 978                 switch (*s) {
 979                 case '+':
 980                 case '?':
 981                         if (!ere)
 982                                 break;
 983                         /* fallthrough */
 984                 case '*':;
 985                         int min=0, max=-1;
 986                         if (*s == '+')
 987                                 min = 1;
 988                         if (*s == '?')
 989                                 max = 1;
 990                         s++;
 991                         ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
 992                         if (!ctx->n)
 993                                 return REG_ESPACE;
 994                         /* extension: multiple consecutive *+?{,} is unspecified,
 995                            but (a+)+ has to be supported so accepting a++ makes
 996                            sense, note however that the RE_DUP_MAX limit can be
 997                            circumvented: (a{255}){255} uses a lot of memory.. */
 998                         goto parse_iter;
 999                 case '\\':
1000                         if (ere || s[1] != '{')
1001                                 break;
1002                         s++;
1003                         goto parse_brace;
1004                 case '{':
1005                         if (!ere)
1006                                 break;
1007                 parse_brace:
1008                         err = parse_dup(ctx, s+1);
1009                         if (err != REG_OK)
1010                                 return err;
1011                         s = ctx->s;
1012                         goto parse_iter;
1013                 }
1014
1015                 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1016                 if ((ere && *s == '|') ||
1017                     (ere && *s == ')' && depth) ||
1018                     (!ere && *s == '\\' && s[1] == ')') ||
1019                     !*s) {
1020                         /* extension: empty branch is unspecified (), (|a), (a|)
1021                            here they are not rejected but match on empty string */
1022                         int c = *s;
1023                         nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1024                         nbranch = 0;
1025                         if (c != '|') {
1026                                 if (c == '\\') {
1027                                         if (!depth) return REG_EPAREN;
1028                                         s+=2;
1029                                 } else if (c == ')')
1030                                         s++;
1031                                 depth--;
1032                                 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1033                                 if (err != REG_OK)
1034                                         return err;
1035                                 if (!c && depth<0) {
1036                                         ctx->submatch_id = subid;
1037                                         return REG_OK;
1038                                 }
1039                                 if (!c || depth<0)
1040                                         return REG_EPAREN;
1041                                 nbranch = tre_stack_pop_voidptr(stack);
1042                                 nunion = tre_stack_pop_voidptr(stack);
1043                                 goto parse_iter;
1044                         }
1045                         s++;
1046                 }
1047         }
1048 }
1049
1050
1051 /***********************************************************************
1052  from tre-compile.c
1053 ***********************************************************************/
1054
1055
1056 /*
1057   TODO:
1058    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1059      function calls.
1060 */
1061
1062 /*
1063   Algorithms to setup tags so that submatch addressing can be done.
1064 */
1065
1066
1067 /* Inserts a catenation node to the root of the tree given in `node'.
1068    As the left child a new tag with number `tag_id' to `node' is added,
1069    and the right child is the old root. */
1070 static reg_errcode_t
1071 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1072 {
1073   tre_catenation_t *c;
1074
1075   c = tre_mem_alloc(mem, sizeof(*c));
1076   if (c == NULL)
1077     return REG_ESPACE;
1078   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1079   if (c->left == NULL)
1080     return REG_ESPACE;
1081   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1082   if (c->right == NULL)
1083     return REG_ESPACE;
1084
1085   c->right->obj = node->obj;
1086   c->right->type = node->type;
1087   c->right->nullable = -1;
1088   c->right->submatch_id = -1;
1089   c->right->firstpos = NULL;
1090   c->right->lastpos = NULL;
1091   c->right->num_tags = 0;
1092   node->obj = c;
1093   node->type = CATENATION;
1094   return REG_OK;
1095 }
1096
1097 /* Inserts a catenation node to the root of the tree given in `node'.
1098    As the right child a new tag with number `tag_id' to `node' is added,
1099    and the left child is the old root. */
1100 static reg_errcode_t
1101 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1102 {
1103   tre_catenation_t *c;
1104
1105   c = tre_mem_alloc(mem, sizeof(*c));
1106   if (c == NULL)
1107     return REG_ESPACE;
1108   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1109   if (c->right == NULL)
1110     return REG_ESPACE;
1111   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1112   if (c->left == NULL)
1113     return REG_ESPACE;
1114
1115   c->left->obj = node->obj;
1116   c->left->type = node->type;
1117   c->left->nullable = -1;
1118   c->left->submatch_id = -1;
1119   c->left->firstpos = NULL;
1120   c->left->lastpos = NULL;
1121   c->left->num_tags = 0;
1122   node->obj = c;
1123   node->type = CATENATION;
1124   return REG_OK;
1125 }
1126
1127 typedef enum {
1128   ADDTAGS_RECURSE,
1129   ADDTAGS_AFTER_ITERATION,
1130   ADDTAGS_AFTER_UNION_LEFT,
1131   ADDTAGS_AFTER_UNION_RIGHT,
1132   ADDTAGS_AFTER_CAT_LEFT,
1133   ADDTAGS_AFTER_CAT_RIGHT,
1134   ADDTAGS_SET_SUBMATCH_END
1135 } tre_addtags_symbol_t;
1136
1137
1138 typedef struct {
1139   int tag;
1140   int next_tag;
1141 } tre_tag_states_t;
1142
1143
1144 /* Go through `regset' and set submatch data for submatches that are
1145    using this tag. */
1146 static void
1147 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1148 {
1149   int i;
1150
1151   for (i = 0; regset[i] >= 0; i++)
1152     {
1153       int id = regset[i] / 2;
1154       int start = !(regset[i] % 2);
1155       if (start)
1156         tnfa->submatch_data[id].so_tag = tag;
1157       else
1158         tnfa->submatch_data[id].eo_tag = tag;
1159     }
1160   regset[0] = -1;
1161 }
1162
1163
1164 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1165    subexpressions marked for submatch addressing can be traced. */
1166 static reg_errcode_t
1167 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1168              tre_tnfa_t *tnfa)
1169 {
1170   reg_errcode_t status = REG_OK;
1171   tre_addtags_symbol_t symbol;
1172   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1173   int bottom = tre_stack_num_objects(stack);
1174   /* True for first pass (counting number of needed tags) */
1175   int first_pass = (mem == NULL || tnfa == NULL);
1176   int *regset, *orig_regset;
1177   int num_tags = 0; /* Total number of tags. */
1178   int num_minimals = 0;  /* Number of special minimal tags. */
1179   int tag = 0;      /* The tag that is to be added next. */
1180   int next_tag = 1; /* Next tag to use after this one. */
1181   int *parents;     /* Stack of submatches the current submatch is
1182                        contained in. */
1183   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1184   tre_tag_states_t *saved_states;
1185
1186   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1187   if (!first_pass)
1188     {
1189       tnfa->end_tag = 0;
1190       tnfa->minimal_tags[0] = -1;
1191     }
1192
1193   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1194   if (regset == NULL)
1195     return REG_ESPACE;
1196   regset[0] = -1;
1197   orig_regset = regset;
1198
1199   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1200   if (parents == NULL)
1201     {
1202       xfree(regset);
1203       return REG_ESPACE;
1204     }
1205   parents[0] = -1;
1206
1207   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1208   if (saved_states == NULL)
1209     {
1210       xfree(regset);
1211       xfree(parents);
1212       return REG_ESPACE;
1213     }
1214   else
1215     {
1216       unsigned int i;
1217       for (i = 0; i <= tnfa->num_submatches; i++)
1218         saved_states[i].tag = -1;
1219     }
1220
1221   STACK_PUSH(stack, voidptr, node);
1222   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1223
1224   while (tre_stack_num_objects(stack) > bottom)
1225     {
1226       if (status != REG_OK)
1227         break;
1228
1229       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1230       switch (symbol)
1231         {
1232
1233         case ADDTAGS_SET_SUBMATCH_END:
1234           {
1235             int id = tre_stack_pop_int(stack);
1236             int i;
1237
1238             /* Add end of this submatch to regset. */
1239             for (i = 0; regset[i] >= 0; i++);
1240             regset[i] = id * 2 + 1;
1241             regset[i + 1] = -1;
1242
1243             /* Pop this submatch from the parents stack. */
1244             for (i = 0; parents[i] >= 0; i++);
1245             parents[i - 1] = -1;
1246             break;
1247           }
1248
1249         case ADDTAGS_RECURSE:
1250           node = tre_stack_pop_voidptr(stack);
1251
1252           if (node->submatch_id >= 0)
1253             {
1254               int id = node->submatch_id;
1255               int i;
1256
1257
1258               /* Add start of this submatch to regset. */
1259               for (i = 0; regset[i] >= 0; i++);
1260               regset[i] = id * 2;
1261               regset[i + 1] = -1;
1262
1263               if (!first_pass)
1264                 {
1265                   for (i = 0; parents[i] >= 0; i++);
1266                   tnfa->submatch_data[id].parents = NULL;
1267                   if (i > 0)
1268                     {
1269                       int *p = xmalloc(sizeof(*p) * (i + 1));
1270                       if (p == NULL)
1271                         {
1272                           status = REG_ESPACE;
1273                           break;
1274                         }
1275                       assert(tnfa->submatch_data[id].parents == NULL);
1276                       tnfa->submatch_data[id].parents = p;
1277                       for (i = 0; parents[i] >= 0; i++)
1278                         p[i] = parents[i];
1279                       p[i] = -1;
1280                     }
1281                 }
1282
1283               /* Add end of this submatch to regset after processing this
1284                  node. */
1285               STACK_PUSHX(stack, int, node->submatch_id);
1286               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1287             }
1288
1289           switch (node->type)
1290             {
1291             case LITERAL:
1292               {
1293                 tre_literal_t *lit = node->obj;
1294
1295                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1296                   {
1297                     int i;
1298                     if (regset[0] >= 0)
1299                       {
1300                         /* Regset is not empty, so add a tag before the
1301                            literal or backref. */
1302                         if (!first_pass)
1303                           {
1304                             status = tre_add_tag_left(mem, node, tag);
1305                             tnfa->tag_directions[tag] = direction;
1306                             if (minimal_tag >= 0)
1307                               {
1308                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1309                                 tnfa->minimal_tags[i] = tag;
1310                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1311                                 tnfa->minimal_tags[i + 2] = -1;
1312                                 minimal_tag = -1;
1313                                 num_minimals++;
1314                               }
1315                             tre_purge_regset(regset, tnfa, tag);
1316                           }
1317                         else
1318                           {
1319                             node->num_tags = 1;
1320                           }
1321
1322                         regset[0] = -1;
1323                         tag = next_tag;
1324                         num_tags++;
1325                         next_tag++;
1326                       }
1327                   }
1328                 else
1329                   {
1330                     assert(!IS_TAG(lit));
1331                   }
1332                 break;
1333               }
1334             case CATENATION:
1335               {
1336                 tre_catenation_t *cat = node->obj;
1337                 tre_ast_node_t *left = cat->left;
1338                 tre_ast_node_t *right = cat->right;
1339                 int reserved_tag = -1;
1340
1341
1342                 /* After processing right child. */
1343                 STACK_PUSHX(stack, voidptr, node);
1344                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1345
1346                 /* Process right child. */
1347                 STACK_PUSHX(stack, voidptr, right);
1348                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1349
1350                 /* After processing left child. */
1351                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1352                 if (left->num_tags > 0 && right->num_tags > 0)
1353                   {
1354                     /* Reserve the next tag to the right child. */
1355                     reserved_tag = next_tag;
1356                     next_tag++;
1357                   }
1358                 STACK_PUSHX(stack, int, reserved_tag);
1359                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1360
1361                 /* Process left child. */
1362                 STACK_PUSHX(stack, voidptr, left);
1363                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1364
1365                 }
1366               break;
1367             case ITERATION:
1368               {
1369                 tre_iteration_t *iter = node->obj;
1370
1371                 if (first_pass)
1372                   {
1373                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1374                   }
1375                 else
1376                   {
1377                     STACK_PUSHX(stack, int, tag);
1378                     STACK_PUSHX(stack, int, iter->minimal);
1379                   }
1380                 STACK_PUSHX(stack, voidptr, node);
1381                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1382
1383                 STACK_PUSHX(stack, voidptr, iter->arg);
1384                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1385
1386                 /* Regset is not empty, so add a tag here. */
1387                 if (regset[0] >= 0 || iter->minimal)
1388                   {
1389                     if (!first_pass)
1390                       {
1391                         int i;
1392                         status = tre_add_tag_left(mem, node, tag);
1393                         if (iter->minimal)
1394                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1395                         else
1396                           tnfa->tag_directions[tag] = direction;
1397                         if (minimal_tag >= 0)
1398                           {
1399                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1400                             tnfa->minimal_tags[i] = tag;
1401                             tnfa->minimal_tags[i + 1] = minimal_tag;
1402                             tnfa->minimal_tags[i + 2] = -1;
1403                             minimal_tag = -1;
1404                             num_minimals++;
1405                           }
1406                         tre_purge_regset(regset, tnfa, tag);
1407                       }
1408
1409                     regset[0] = -1;
1410                     tag = next_tag;
1411                     num_tags++;
1412                     next_tag++;
1413                   }
1414                 direction = TRE_TAG_MINIMIZE;
1415               }
1416               break;
1417             case UNION:
1418               {
1419                 tre_union_t *uni = node->obj;
1420                 tre_ast_node_t *left = uni->left;
1421                 tre_ast_node_t *right = uni->right;
1422                 int left_tag;
1423                 int right_tag;
1424
1425                 if (regset[0] >= 0)
1426                   {
1427                     left_tag = next_tag;
1428                     right_tag = next_tag + 1;
1429                   }
1430                 else
1431                   {
1432                     left_tag = tag;
1433                     right_tag = next_tag;
1434                   }
1435
1436                 /* After processing right child. */
1437                 STACK_PUSHX(stack, int, right_tag);
1438                 STACK_PUSHX(stack, int, left_tag);
1439                 STACK_PUSHX(stack, voidptr, regset);
1440                 STACK_PUSHX(stack, int, regset[0] >= 0);
1441                 STACK_PUSHX(stack, voidptr, node);
1442                 STACK_PUSHX(stack, voidptr, right);
1443                 STACK_PUSHX(stack, voidptr, left);
1444                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1445
1446                 /* Process right child. */
1447                 STACK_PUSHX(stack, voidptr, right);
1448                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1449
1450                 /* After processing left child. */
1451                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1452
1453                 /* Process left child. */
1454                 STACK_PUSHX(stack, voidptr, left);
1455                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1456
1457                 /* Regset is not empty, so add a tag here. */
1458                 if (regset[0] >= 0)
1459                   {
1460                     if (!first_pass)
1461                       {
1462                         int i;
1463                         status = tre_add_tag_left(mem, node, tag);
1464                         tnfa->tag_directions[tag] = direction;
1465                         if (minimal_tag >= 0)
1466                           {
1467                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1468                             tnfa->minimal_tags[i] = tag;
1469                             tnfa->minimal_tags[i + 1] = minimal_tag;
1470                             tnfa->minimal_tags[i + 2] = -1;
1471                             minimal_tag = -1;
1472                             num_minimals++;
1473                           }
1474                         tre_purge_regset(regset, tnfa, tag);
1475                       }
1476
1477                     regset[0] = -1;
1478                     tag = next_tag;
1479                     num_tags++;
1480                     next_tag++;
1481                   }
1482
1483                 if (node->num_submatches > 0)
1484                   {
1485                     /* The next two tags are reserved for markers. */
1486                     next_tag++;
1487                     tag = next_tag;
1488                     next_tag++;
1489                   }
1490
1491                 break;
1492               }
1493             }
1494
1495           if (node->submatch_id >= 0)
1496             {
1497               int i;
1498               /* Push this submatch on the parents stack. */
1499               for (i = 0; parents[i] >= 0; i++);
1500               parents[i] = node->submatch_id;
1501               parents[i + 1] = -1;
1502             }
1503
1504           break; /* end case: ADDTAGS_RECURSE */
1505
1506         case ADDTAGS_AFTER_ITERATION:
1507           {
1508             int minimal = 0;
1509             int enter_tag;
1510             node = tre_stack_pop_voidptr(stack);
1511             if (first_pass)
1512               {
1513                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1514                   + tre_stack_pop_int(stack);
1515                 minimal_tag = -1;
1516               }
1517             else
1518               {
1519                 minimal = tre_stack_pop_int(stack);
1520                 enter_tag = tre_stack_pop_int(stack);
1521                 if (minimal)
1522                   minimal_tag = enter_tag;
1523               }
1524
1525             if (!first_pass)
1526               {
1527                 if (minimal)
1528                   direction = TRE_TAG_MINIMIZE;
1529                 else
1530                   direction = TRE_TAG_MAXIMIZE;
1531               }
1532             break;
1533           }
1534
1535         case ADDTAGS_AFTER_CAT_LEFT:
1536           {
1537             int new_tag = tre_stack_pop_int(stack);
1538             next_tag = tre_stack_pop_int(stack);
1539             if (new_tag >= 0)
1540               {
1541                 tag = new_tag;
1542               }
1543             break;
1544           }
1545
1546         case ADDTAGS_AFTER_CAT_RIGHT:
1547           node = tre_stack_pop_voidptr(stack);
1548           if (first_pass)
1549             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1550               + ((tre_catenation_t *)node->obj)->right->num_tags;
1551           break;
1552
1553         case ADDTAGS_AFTER_UNION_LEFT:
1554           /* Lift the bottom of the `regset' array so that when processing
1555              the right operand the items currently in the array are
1556              invisible.  The original bottom was saved at ADDTAGS_UNION and
1557              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1558           while (*regset >= 0)
1559             regset++;
1560           break;
1561
1562         case ADDTAGS_AFTER_UNION_RIGHT:
1563           {
1564             int added_tags, tag_left, tag_right;
1565             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1566             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1567             node = tre_stack_pop_voidptr(stack);
1568             added_tags = tre_stack_pop_int(stack);
1569             if (first_pass)
1570               {
1571                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1572                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1573                   + ((node->num_submatches > 0) ? 2 : 0);
1574               }
1575             regset = tre_stack_pop_voidptr(stack);
1576             tag_left = tre_stack_pop_int(stack);
1577             tag_right = tre_stack_pop_int(stack);
1578
1579             /* Add tags after both children, the left child gets a smaller
1580                tag than the right child.  This guarantees that we prefer
1581                the left child over the right child. */
1582             /* XXX - This is not always necessary (if the children have
1583                tags which must be seen for every match of that child). */
1584             /* XXX - Check if this is the only place where tre_add_tag_right
1585                is used.  If so, use tre_add_tag_left (putting the tag before
1586                the child as opposed after the child) and throw away
1587                tre_add_tag_right. */
1588             if (node->num_submatches > 0)
1589               {
1590                 if (!first_pass)
1591                   {
1592                     status = tre_add_tag_right(mem, left, tag_left);
1593                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1594                     if (status == REG_OK)
1595                       status = tre_add_tag_right(mem, right, tag_right);
1596                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1597                   }
1598                 num_tags += 2;
1599               }
1600             direction = TRE_TAG_MAXIMIZE;
1601             break;
1602           }
1603
1604         default:
1605           assert(0);
1606           break;
1607
1608         } /* end switch(symbol) */
1609     } /* end while(tre_stack_num_objects(stack) > bottom) */
1610
1611   if (!first_pass)
1612     tre_purge_regset(regset, tnfa, tag);
1613
1614   if (!first_pass && minimal_tag >= 0)
1615     {
1616       int i;
1617       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1618       tnfa->minimal_tags[i] = tag;
1619       tnfa->minimal_tags[i + 1] = minimal_tag;
1620       tnfa->minimal_tags[i + 2] = -1;
1621       minimal_tag = -1;
1622       num_minimals++;
1623     }
1624
1625   assert(tree->num_tags == num_tags);
1626   tnfa->end_tag = num_tags;
1627   tnfa->num_tags = num_tags;
1628   tnfa->num_minimals = num_minimals;
1629   xfree(orig_regset);
1630   xfree(parents);
1631   xfree(saved_states);
1632   return status;
1633 }
1634
1635
1636
1637 /*
1638   AST to TNFA compilation routines.
1639 */
1640
1641 typedef enum {
1642   COPY_RECURSE,
1643   COPY_SET_RESULT_PTR
1644 } tre_copyast_symbol_t;
1645
1646 /* Flags for tre_copy_ast(). */
1647 #define COPY_REMOVE_TAGS         1
1648 #define COPY_MAXIMIZE_FIRST_TAG  2
1649
1650 static reg_errcode_t
1651 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1652              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1653              tre_ast_node_t **copy, int *max_pos)
1654 {
1655   reg_errcode_t status = REG_OK;
1656   int bottom = tre_stack_num_objects(stack);
1657   int num_copied = 0;
1658   int first_tag = 1;
1659   tre_ast_node_t **result = copy;
1660   tre_copyast_symbol_t symbol;
1661
1662   STACK_PUSH(stack, voidptr, ast);
1663   STACK_PUSH(stack, int, COPY_RECURSE);
1664
1665   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1666     {
1667       tre_ast_node_t *node;
1668       if (status != REG_OK)
1669         break;
1670
1671       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1672       switch (symbol)
1673         {
1674         case COPY_SET_RESULT_PTR:
1675           result = tre_stack_pop_voidptr(stack);
1676           break;
1677         case COPY_RECURSE:
1678           node = tre_stack_pop_voidptr(stack);
1679           switch (node->type)
1680             {
1681             case LITERAL:
1682               {
1683                 tre_literal_t *lit = node->obj;
1684                 int pos = lit->position;
1685                 int min = lit->code_min;
1686                 int max = lit->code_max;
1687                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1688                   {
1689                     /* XXX - e.g. [ab] has only one position but two
1690                        nodes, so we are creating holes in the state space
1691                        here.  Not fatal, just wastes memory. */
1692                     pos += *pos_add;
1693                     num_copied++;
1694                   }
1695                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1696                   {
1697                     /* Change this tag to empty. */
1698                     min = EMPTY;
1699                     max = pos = -1;
1700                   }
1701                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1702                          && first_tag)
1703                   {
1704                     /* Maximize the first tag. */
1705                     tag_directions[max] = TRE_TAG_MAXIMIZE;
1706                     first_tag = 0;
1707                   }
1708                 *result = tre_ast_new_literal(mem, min, max, pos);
1709                 if (*result == NULL)
1710                   status = REG_ESPACE;
1711                 else {
1712                   tre_literal_t *p = (*result)->obj;
1713                   p->class = lit->class;
1714                   p->neg_classes = lit->neg_classes;
1715                 }
1716
1717                 if (pos > *max_pos)
1718                   *max_pos = pos;
1719                 break;
1720               }
1721             case UNION:
1722               {
1723                 tre_union_t *uni = node->obj;
1724                 tre_union_t *tmp;
1725                 *result = tre_ast_new_union(mem, uni->left, uni->right);
1726                 if (*result == NULL)
1727                   {
1728                     status = REG_ESPACE;
1729                     break;
1730                   }
1731                 tmp = (*result)->obj;
1732                 result = &tmp->left;
1733                 STACK_PUSHX(stack, voidptr, uni->right);
1734                 STACK_PUSHX(stack, int, COPY_RECURSE);
1735                 STACK_PUSHX(stack, voidptr, &tmp->right);
1736                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1737                 STACK_PUSHX(stack, voidptr, uni->left);
1738                 STACK_PUSHX(stack, int, COPY_RECURSE);
1739                 break;
1740               }
1741             case CATENATION:
1742               {
1743                 tre_catenation_t *cat = node->obj;
1744                 tre_catenation_t *tmp;
1745                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1746                 if (*result == NULL)
1747                   {
1748                     status = REG_ESPACE;
1749                     break;
1750                   }
1751                 tmp = (*result)->obj;
1752                 tmp->left = NULL;
1753                 tmp->right = NULL;
1754                 result = &tmp->left;
1755
1756                 STACK_PUSHX(stack, voidptr, cat->right);
1757                 STACK_PUSHX(stack, int, COPY_RECURSE);
1758                 STACK_PUSHX(stack, voidptr, &tmp->right);
1759                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1760                 STACK_PUSHX(stack, voidptr, cat->left);
1761                 STACK_PUSHX(stack, int, COPY_RECURSE);
1762                 break;
1763               }
1764             case ITERATION:
1765               {
1766                 tre_iteration_t *iter = node->obj;
1767                 STACK_PUSHX(stack, voidptr, iter->arg);
1768                 STACK_PUSHX(stack, int, COPY_RECURSE);
1769                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1770                                            iter->max, iter->minimal);
1771                 if (*result == NULL)
1772                   {
1773                     status = REG_ESPACE;
1774                     break;
1775                   }
1776                 iter = (*result)->obj;
1777                 result = &iter->arg;
1778                 break;
1779               }
1780             default:
1781               assert(0);
1782               break;
1783             }
1784           break;
1785         }
1786     }
1787   *pos_add += num_copied;
1788   return status;
1789 }
1790
1791 typedef enum {
1792   EXPAND_RECURSE,
1793   EXPAND_AFTER_ITER
1794 } tre_expand_ast_symbol_t;
1795
1796 /* Expands each iteration node that has a finite nonzero minimum or maximum
1797    iteration count to a catenated sequence of copies of the node. */
1798 static reg_errcode_t
1799 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1800                int *position, tre_tag_direction_t *tag_directions)
1801 {
1802   reg_errcode_t status = REG_OK;
1803   int bottom = tre_stack_num_objects(stack);
1804   int pos_add = 0;
1805   int pos_add_total = 0;
1806   int max_pos = 0;
1807   int iter_depth = 0;
1808
1809   STACK_PUSHR(stack, voidptr, ast);
1810   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1811   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1812     {
1813       tre_ast_node_t *node;
1814       tre_expand_ast_symbol_t symbol;
1815
1816       if (status != REG_OK)
1817         break;
1818
1819       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1820       node = tre_stack_pop_voidptr(stack);
1821       switch (symbol)
1822         {
1823         case EXPAND_RECURSE:
1824           switch (node->type)
1825             {
1826             case LITERAL:
1827               {
1828                 tre_literal_t *lit= node->obj;
1829                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1830                   {
1831                     lit->position += pos_add;
1832                     if (lit->position > max_pos)
1833                       max_pos = lit->position;
1834                   }
1835                 break;
1836               }
1837             case UNION:
1838               {
1839                 tre_union_t *uni = node->obj;
1840                 STACK_PUSHX(stack, voidptr, uni->right);
1841                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1842                 STACK_PUSHX(stack, voidptr, uni->left);
1843                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1844                 break;
1845               }
1846             case CATENATION:
1847               {
1848                 tre_catenation_t *cat = node->obj;
1849                 STACK_PUSHX(stack, voidptr, cat->right);
1850                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1851                 STACK_PUSHX(stack, voidptr, cat->left);
1852                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1853                 break;
1854               }
1855             case ITERATION:
1856               {
1857                 tre_iteration_t *iter = node->obj;
1858                 STACK_PUSHX(stack, int, pos_add);
1859                 STACK_PUSHX(stack, voidptr, node);
1860                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1861                 STACK_PUSHX(stack, voidptr, iter->arg);
1862                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1863                 /* If we are going to expand this node at EXPAND_AFTER_ITER
1864                    then don't increase the `pos' fields of the nodes now, it
1865                    will get done when expanding. */
1866                 if (iter->min > 1 || iter->max > 1)
1867                   pos_add = 0;
1868                 iter_depth++;
1869                 break;
1870               }
1871             default:
1872               assert(0);
1873               break;
1874             }
1875           break;
1876         case EXPAND_AFTER_ITER:
1877           {
1878             tre_iteration_t *iter = node->obj;
1879             int pos_add_last;
1880             pos_add = tre_stack_pop_int(stack);
1881             pos_add_last = pos_add;
1882             if (iter->min > 1 || iter->max > 1)
1883               {
1884                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1885                 int j;
1886                 int pos_add_save = pos_add;
1887
1888                 /* Create a catenated sequence of copies of the node. */
1889                 for (j = 0; j < iter->min; j++)
1890                   {
1891                     tre_ast_node_t *copy;
1892                     /* Remove tags from all but the last copy. */
1893                     int flags = ((j + 1 < iter->min)
1894                                  ? COPY_REMOVE_TAGS
1895                                  : COPY_MAXIMIZE_FIRST_TAG);
1896                     pos_add_save = pos_add;
1897                     status = tre_copy_ast(mem, stack, iter->arg, flags,
1898                                           &pos_add, tag_directions, &copy,
1899                                           &max_pos);
1900                     if (status != REG_OK)
1901                       return status;
1902                     if (seq1 != NULL)
1903                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
1904                     else
1905                       seq1 = copy;
1906                     if (seq1 == NULL)
1907                       return REG_ESPACE;
1908                   }
1909
1910                 if (iter->max == -1)
1911                   {
1912                     /* No upper limit. */
1913                     pos_add_save = pos_add;
1914                     status = tre_copy_ast(mem, stack, iter->arg, 0,
1915                                           &pos_add, NULL, &seq2, &max_pos);
1916                     if (status != REG_OK)
1917                       return status;
1918                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1919                     if (seq2 == NULL)
1920                       return REG_ESPACE;
1921                   }
1922                 else
1923                   {
1924                     for (j = iter->min; j < iter->max; j++)
1925                       {
1926                         tre_ast_node_t *tmp, *copy;
1927                         pos_add_save = pos_add;
1928                         status = tre_copy_ast(mem, stack, iter->arg, 0,
1929                                               &pos_add, NULL, &copy, &max_pos);
1930                         if (status != REG_OK)
1931                           return status;
1932                         if (seq2 != NULL)
1933                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
1934                         else
1935                           seq2 = copy;
1936                         if (seq2 == NULL)
1937                           return REG_ESPACE;
1938                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1939                         if (tmp == NULL)
1940                           return REG_ESPACE;
1941                         seq2 = tre_ast_new_union(mem, tmp, seq2);
1942                         if (seq2 == NULL)
1943                           return REG_ESPACE;
1944                       }
1945                   }
1946
1947                 pos_add = pos_add_save;
1948                 if (seq1 == NULL)
1949                   seq1 = seq2;
1950                 else if (seq2 != NULL)
1951                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1952                 if (seq1 == NULL)
1953                   return REG_ESPACE;
1954                 node->obj = seq1->obj;
1955                 node->type = seq1->type;
1956               }
1957
1958             iter_depth--;
1959             pos_add_total += pos_add - pos_add_last;
1960             if (iter_depth == 0)
1961               pos_add = pos_add_total;
1962
1963             break;
1964           }
1965         default:
1966           assert(0);
1967           break;
1968         }
1969     }
1970
1971   *position += pos_add_total;
1972
1973   /* `max_pos' should never be larger than `*position' if the above
1974      code works, but just an extra safeguard let's make sure
1975      `*position' is set large enough so enough memory will be
1976      allocated for the transition table. */
1977   if (max_pos > *position)
1978     *position = max_pos;
1979
1980   return status;
1981 }
1982
1983 static tre_pos_and_tags_t *
1984 tre_set_empty(tre_mem_t mem)
1985 {
1986   tre_pos_and_tags_t *new_set;
1987
1988   new_set = tre_mem_calloc(mem, sizeof(*new_set));
1989   if (new_set == NULL)
1990     return NULL;
1991
1992   new_set[0].position = -1;
1993   new_set[0].code_min = -1;
1994   new_set[0].code_max = -1;
1995
1996   return new_set;
1997 }
1998
1999 static tre_pos_and_tags_t *
2000 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2001             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2002 {
2003   tre_pos_and_tags_t *new_set;
2004
2005   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2006   if (new_set == NULL)
2007     return NULL;
2008
2009   new_set[0].position = position;
2010   new_set[0].code_min = code_min;
2011   new_set[0].code_max = code_max;
2012   new_set[0].class = class;
2013   new_set[0].neg_classes = neg_classes;
2014   new_set[0].backref = backref;
2015   new_set[1].position = -1;
2016   new_set[1].code_min = -1;
2017   new_set[1].code_max = -1;
2018
2019   return new_set;
2020 }
2021
2022 static tre_pos_and_tags_t *
2023 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2024               int *tags, int assertions)
2025 {
2026   int s1, s2, i, j;
2027   tre_pos_and_tags_t *new_set;
2028   int *new_tags;
2029   int num_tags;
2030
2031   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2032   for (s1 = 0; set1[s1].position >= 0; s1++);
2033   for (s2 = 0; set2[s2].position >= 0; s2++);
2034   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2035   if (!new_set )
2036     return NULL;
2037
2038   for (s1 = 0; set1[s1].position >= 0; s1++)
2039     {
2040       new_set[s1].position = set1[s1].position;
2041       new_set[s1].code_min = set1[s1].code_min;
2042       new_set[s1].code_max = set1[s1].code_max;
2043       new_set[s1].assertions = set1[s1].assertions | assertions;
2044       new_set[s1].class = set1[s1].class;
2045       new_set[s1].neg_classes = set1[s1].neg_classes;
2046       new_set[s1].backref = set1[s1].backref;
2047       if (set1[s1].tags == NULL && tags == NULL)
2048         new_set[s1].tags = NULL;
2049       else
2050         {
2051           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2052           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2053                                          * (i + num_tags + 1)));
2054           if (new_tags == NULL)
2055             return NULL;
2056           for (j = 0; j < i; j++)
2057             new_tags[j] = set1[s1].tags[j];
2058           for (i = 0; i < num_tags; i++)
2059             new_tags[j + i] = tags[i];
2060           new_tags[j + i] = -1;
2061           new_set[s1].tags = new_tags;
2062         }
2063     }
2064
2065   for (s2 = 0; set2[s2].position >= 0; s2++)
2066     {
2067       new_set[s1 + s2].position = set2[s2].position;
2068       new_set[s1 + s2].code_min = set2[s2].code_min;
2069       new_set[s1 + s2].code_max = set2[s2].code_max;
2070       /* XXX - why not | assertions here as well? */
2071       new_set[s1 + s2].assertions = set2[s2].assertions;
2072       new_set[s1 + s2].class = set2[s2].class;
2073       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2074       new_set[s1 + s2].backref = set2[s2].backref;
2075       if (set2[s2].tags == NULL)
2076         new_set[s1 + s2].tags = NULL;
2077       else
2078         {
2079           for (i = 0; set2[s2].tags[i] >= 0; i++);
2080           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2081           if (new_tags == NULL)
2082             return NULL;
2083           for (j = 0; j < i; j++)
2084             new_tags[j] = set2[s2].tags[j];
2085           new_tags[j] = -1;
2086           new_set[s1 + s2].tags = new_tags;
2087         }
2088     }
2089   new_set[s1 + s2].position = -1;
2090   return new_set;
2091 }
2092
2093 /* Finds the empty path through `node' which is the one that should be
2094    taken according to POSIX.2 rules, and adds the tags on that path to
2095    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2096    set to the number of tags seen on the path. */
2097 static reg_errcode_t
2098 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2099                 int *assertions, int *num_tags_seen)
2100 {
2101   tre_literal_t *lit;
2102   tre_union_t *uni;
2103   tre_catenation_t *cat;
2104   tre_iteration_t *iter;
2105   int i;
2106   int bottom = tre_stack_num_objects(stack);
2107   reg_errcode_t status = REG_OK;
2108   if (num_tags_seen)
2109     *num_tags_seen = 0;
2110
2111   status = tre_stack_push_voidptr(stack, node);
2112
2113   /* Walk through the tree recursively. */
2114   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2115     {
2116       node = tre_stack_pop_voidptr(stack);
2117
2118       switch (node->type)
2119         {
2120         case LITERAL:
2121           lit = (tre_literal_t *)node->obj;
2122           switch (lit->code_min)
2123             {
2124             case TAG:
2125               if (lit->code_max >= 0)
2126                 {
2127                   if (tags != NULL)
2128                     {
2129                       /* Add the tag to `tags'. */
2130                       for (i = 0; tags[i] >= 0; i++)
2131                         if (tags[i] == lit->code_max)
2132                           break;
2133                       if (tags[i] < 0)
2134                         {
2135                           tags[i] = lit->code_max;
2136                           tags[i + 1] = -1;
2137                         }
2138                     }
2139                   if (num_tags_seen)
2140                     (*num_tags_seen)++;
2141                 }
2142               break;
2143             case ASSERTION:
2144               assert(lit->code_max >= 1
2145                      || lit->code_max <= ASSERT_LAST);
2146               if (assertions != NULL)
2147                 *assertions |= lit->code_max;
2148               break;
2149             case EMPTY:
2150               break;
2151             default:
2152               assert(0);
2153               break;
2154             }
2155           break;
2156
2157         case UNION:
2158           /* Subexpressions starting earlier take priority over ones
2159              starting later, so we prefer the left subexpression over the
2160              right subexpression. */
2161           uni = (tre_union_t *)node->obj;
2162           if (uni->left->nullable)
2163             STACK_PUSHX(stack, voidptr, uni->left)
2164           else if (uni->right->nullable)
2165             STACK_PUSHX(stack, voidptr, uni->right)
2166           else
2167             assert(0);
2168           break;
2169
2170         case CATENATION:
2171           /* The path must go through both children. */
2172           cat = (tre_catenation_t *)node->obj;
2173           assert(cat->left->nullable);
2174           assert(cat->right->nullable);
2175           STACK_PUSHX(stack, voidptr, cat->left);
2176           STACK_PUSHX(stack, voidptr, cat->right);
2177           break;
2178
2179         case ITERATION:
2180           /* A match with an empty string is preferred over no match at
2181              all, so we go through the argument if possible. */
2182           iter = (tre_iteration_t *)node->obj;
2183           if (iter->arg->nullable)
2184             STACK_PUSHX(stack, voidptr, iter->arg);
2185           break;
2186
2187         default:
2188           assert(0);
2189           break;
2190         }
2191     }
2192
2193   return status;
2194 }
2195
2196
2197 typedef enum {
2198   NFL_RECURSE,
2199   NFL_POST_UNION,
2200   NFL_POST_CATENATION,
2201   NFL_POST_ITERATION
2202 } tre_nfl_stack_symbol_t;
2203
2204
2205 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2206    the nodes of the AST `tree'. */
2207 static reg_errcode_t
2208 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2209 {
2210   int bottom = tre_stack_num_objects(stack);
2211
2212   STACK_PUSHR(stack, voidptr, tree);
2213   STACK_PUSHR(stack, int, NFL_RECURSE);
2214
2215   while (tre_stack_num_objects(stack) > bottom)
2216     {
2217       tre_nfl_stack_symbol_t symbol;
2218       tre_ast_node_t *node;
2219
2220       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2221       node = tre_stack_pop_voidptr(stack);
2222       switch (symbol)
2223         {
2224         case NFL_RECURSE:
2225           switch (node->type)
2226             {
2227             case LITERAL:
2228               {
2229                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2230                 if (IS_BACKREF(lit))
2231                   {
2232                     /* Back references: nullable = false, firstpos = {i},
2233                        lastpos = {i}. */
2234                     node->nullable = 0;
2235                     node->firstpos = tre_set_one(mem, lit->position, 0,
2236                                              TRE_CHAR_MAX, 0, NULL, -1);
2237                     if (!node->firstpos)
2238                       return REG_ESPACE;
2239                     node->lastpos = tre_set_one(mem, lit->position, 0,
2240                                                 TRE_CHAR_MAX, 0, NULL,
2241                                                 (int)lit->code_max);
2242                     if (!node->lastpos)
2243                       return REG_ESPACE;
2244                   }
2245                 else if (lit->code_min < 0)
2246                   {
2247                     /* Tags, empty strings, params, and zero width assertions:
2248                        nullable = true, firstpos = {}, and lastpos = {}. */
2249                     node->nullable = 1;
2250                     node->firstpos = tre_set_empty(mem);
2251                     if (!node->firstpos)
2252                       return REG_ESPACE;
2253                     node->lastpos = tre_set_empty(mem);
2254                     if (!node->lastpos)
2255                       return REG_ESPACE;
2256                   }
2257                 else
2258                   {
2259                     /* Literal at position i: nullable = false, firstpos = {i},
2260                        lastpos = {i}. */
2261                     node->nullable = 0;
2262                     node->firstpos =
2263                       tre_set_one(mem, lit->position, (int)lit->code_min,
2264                                   (int)lit->code_max, 0, NULL, -1);
2265                     if (!node->firstpos)
2266                       return REG_ESPACE;
2267                     node->lastpos = tre_set_one(mem, lit->position,
2268                                                 (int)lit->code_min,
2269                                                 (int)lit->code_max,
2270                                                 lit->class, lit->neg_classes,
2271                                                 -1);
2272                     if (!node->lastpos)
2273                       return REG_ESPACE;
2274                   }
2275                 break;
2276               }
2277
2278             case UNION:
2279               /* Compute the attributes for the two subtrees, and after that
2280                  for this node. */
2281               STACK_PUSHR(stack, voidptr, node);
2282               STACK_PUSHR(stack, int, NFL_POST_UNION);
2283               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2284               STACK_PUSHR(stack, int, NFL_RECURSE);
2285               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2286               STACK_PUSHR(stack, int, NFL_RECURSE);
2287               break;
2288
2289             case CATENATION:
2290               /* Compute the attributes for the two subtrees, and after that
2291                  for this node. */
2292               STACK_PUSHR(stack, voidptr, node);
2293               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2294               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2295               STACK_PUSHR(stack, int, NFL_RECURSE);
2296               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2297               STACK_PUSHR(stack, int, NFL_RECURSE);
2298               break;
2299
2300             case ITERATION:
2301               /* Compute the attributes for the subtree, and after that for
2302                  this node. */
2303               STACK_PUSHR(stack, voidptr, node);
2304               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2305               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2306               STACK_PUSHR(stack, int, NFL_RECURSE);
2307               break;
2308             }
2309           break; /* end case: NFL_RECURSE */
2310
2311         case NFL_POST_UNION:
2312           {
2313             tre_union_t *uni = (tre_union_t *)node->obj;
2314             node->nullable = uni->left->nullable || uni->right->nullable;
2315             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2316                                            uni->right->firstpos, NULL, 0);
2317             if (!node->firstpos)
2318               return REG_ESPACE;
2319             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2320                                           uni->right->lastpos, NULL, 0);
2321             if (!node->lastpos)
2322               return REG_ESPACE;
2323             break;
2324           }
2325
2326         case NFL_POST_ITERATION:
2327           {
2328             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2329
2330             if (iter->min == 0 || iter->arg->nullable)
2331               node->nullable = 1;
2332             else
2333               node->nullable = 0;
2334             node->firstpos = iter->arg->firstpos;
2335             node->lastpos = iter->arg->lastpos;
2336             break;
2337           }
2338
2339         case NFL_POST_CATENATION:
2340           {
2341             int num_tags, *tags, assertions;
2342             reg_errcode_t status;
2343             tre_catenation_t *cat = node->obj;
2344             node->nullable = cat->left->nullable && cat->right->nullable;
2345
2346             /* Compute firstpos. */
2347             if (cat->left->nullable)
2348               {
2349                 /* The left side matches the empty string.  Make a first pass
2350                    with tre_match_empty() to get the number of tags and
2351                    parameters. */
2352                 status = tre_match_empty(stack, cat->left,
2353                                          NULL, NULL, &num_tags);
2354                 if (status != REG_OK)
2355                   return status;
2356                 /* Allocate arrays for the tags and parameters. */
2357                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2358                 if (!tags)
2359                   return REG_ESPACE;
2360                 tags[0] = -1;
2361                 assertions = 0;
2362                 /* Second pass with tre_mach_empty() to get the list of
2363                    tags and parameters. */
2364                 status = tre_match_empty(stack, cat->left, tags,
2365                                          &assertions, NULL);
2366                 if (status != REG_OK)
2367                   {
2368                     xfree(tags);
2369                     return status;
2370                   }
2371                 node->firstpos =
2372                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2373                                 tags, assertions);
2374                 xfree(tags);
2375                 if (!node->firstpos)
2376                   return REG_ESPACE;
2377               }
2378             else
2379               {
2380                 node->firstpos = cat->left->firstpos;
2381               }
2382
2383             /* Compute lastpos. */
2384             if (cat->right->nullable)
2385               {
2386                 /* The right side matches the empty string.  Make a first pass
2387                    with tre_match_empty() to get the number of tags and
2388                    parameters. */
2389                 status = tre_match_empty(stack, cat->right,
2390                                          NULL, NULL, &num_tags);
2391                 if (status != REG_OK)
2392                   return status;
2393                 /* Allocate arrays for the tags and parameters. */
2394                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2395                 if (!tags)
2396                   return REG_ESPACE;
2397                 tags[0] = -1;
2398                 assertions = 0;
2399                 /* Second pass with tre_mach_empty() to get the list of
2400                    tags and parameters. */
2401                 status = tre_match_empty(stack, cat->right, tags,
2402                                          &assertions, NULL);
2403                 if (status != REG_OK)
2404                   {
2405                     xfree(tags);
2406                     return status;
2407                   }
2408                 node->lastpos =
2409                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2410                                 tags, assertions);
2411                 xfree(tags);
2412                 if (!node->lastpos)
2413                   return REG_ESPACE;
2414               }
2415             else
2416               {
2417                 node->lastpos = cat->right->lastpos;
2418               }
2419             break;
2420           }
2421
2422         default:
2423           assert(0);
2424           break;
2425         }
2426     }
2427
2428   return REG_OK;
2429 }
2430
2431
2432 /* Adds a transition from each position in `p1' to each position in `p2'. */
2433 static reg_errcode_t
2434 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2435                tre_tnfa_transition_t *transitions,
2436                int *counts, int *offs)
2437 {
2438   tre_pos_and_tags_t *orig_p2 = p2;
2439   tre_tnfa_transition_t *trans;
2440   int i, j, k, l, dup, prev_p2_pos;
2441
2442   if (transitions != NULL)
2443     while (p1->position >= 0)
2444       {
2445         p2 = orig_p2;
2446         prev_p2_pos = -1;
2447         while (p2->position >= 0)
2448           {
2449             /* Optimization: if this position was already handled, skip it. */
2450             if (p2->position == prev_p2_pos)
2451               {
2452                 p2++;
2453                 continue;
2454               }
2455             prev_p2_pos = p2->position;
2456             /* Set `trans' to point to the next unused transition from
2457                position `p1->position'. */
2458             trans = transitions + offs[p1->position];
2459             while (trans->state != NULL)
2460               {
2461 #if 0
2462                 /* If we find a previous transition from `p1->position' to
2463                    `p2->position', it is overwritten.  This can happen only
2464                    if there are nested loops in the regexp, like in "((a)*)*".
2465                    In POSIX.2 repetition using the outer loop is always
2466                    preferred over using the inner loop.  Therefore the
2467                    transition for the inner loop is useless and can be thrown
2468                    away. */
2469                 /* XXX - The same position is used for all nodes in a bracket
2470                    expression, so this optimization cannot be used (it will
2471                    break bracket expressions) unless I figure out a way to
2472                    detect it here. */
2473                 if (trans->state_id == p2->position)
2474                   {
2475                     break;
2476                   }
2477 #endif
2478                 trans++;
2479               }
2480
2481             if (trans->state == NULL)
2482               (trans + 1)->state = NULL;
2483             /* Use the character ranges, assertions, etc. from `p1' for
2484                the transition from `p1' to `p2'. */
2485             trans->code_min = p1->code_min;
2486             trans->code_max = p1->code_max;
2487             trans->state = transitions + offs[p2->position];
2488             trans->state_id = p2->position;
2489             trans->assertions = p1->assertions | p2->assertions
2490               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2491               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2492             if (p1->backref >= 0)
2493               {
2494                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2495                 assert(p2->backref < 0);
2496                 trans->u.backref = p1->backref;
2497                 trans->assertions |= ASSERT_BACKREF;
2498               }
2499             else
2500               trans->u.class = p1->class;
2501             if (p1->neg_classes != NULL)
2502               {
2503                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2504                 trans->neg_classes =
2505                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2506                 if (trans->neg_classes == NULL)
2507                   return REG_ESPACE;
2508                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2509                   trans->neg_classes[i] = p1->neg_classes[i];
2510                 trans->neg_classes[i] = (tre_ctype_t)0;
2511               }
2512             else
2513               trans->neg_classes = NULL;
2514
2515             /* Find out how many tags this transition has. */
2516             i = 0;
2517             if (p1->tags != NULL)
2518               while(p1->tags[i] >= 0)
2519                 i++;
2520             j = 0;
2521             if (p2->tags != NULL)
2522               while(p2->tags[j] >= 0)
2523                 j++;
2524
2525             /* If we are overwriting a transition, free the old tag array. */
2526             if (trans->tags != NULL)
2527               xfree(trans->tags);
2528             trans->tags = NULL;
2529
2530             /* If there were any tags, allocate an array and fill it. */
2531             if (i + j > 0)
2532               {
2533                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2534                 if (!trans->tags)
2535                   return REG_ESPACE;
2536                 i = 0;
2537                 if (p1->tags != NULL)
2538                   while(p1->tags[i] >= 0)
2539                     {
2540                       trans->tags[i] = p1->tags[i];
2541                       i++;
2542                     }
2543                 l = i;
2544                 j = 0;
2545                 if (p2->tags != NULL)
2546                   while (p2->tags[j] >= 0)
2547                     {
2548                       /* Don't add duplicates. */
2549                       dup = 0;
2550                       for (k = 0; k < i; k++)
2551                         if (trans->tags[k] == p2->tags[j])
2552                           {
2553                             dup = 1;
2554                             break;
2555                           }
2556                       if (!dup)
2557                         trans->tags[l++] = p2->tags[j];
2558                       j++;
2559                     }
2560                 trans->tags[l] = -1;
2561               }
2562
2563             p2++;
2564           }
2565         p1++;
2566       }
2567   else
2568     /* Compute a maximum limit for the number of transitions leaving
2569        from each state. */
2570     while (p1->position >= 0)
2571       {
2572         p2 = orig_p2;
2573         while (p2->position >= 0)
2574           {
2575             counts[p1->position]++;
2576             p2++;
2577           }
2578         p1++;
2579       }
2580   return REG_OK;
2581 }
2582
2583 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
2584    labelled with one character range (there are no transitions on empty
2585    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2586    the regexp. */
2587 static reg_errcode_t
2588 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2589                 int *counts, int *offs)
2590 {
2591   tre_union_t *uni;
2592   tre_catenation_t *cat;
2593   tre_iteration_t *iter;
2594   reg_errcode_t errcode = REG_OK;
2595
2596   /* XXX - recurse using a stack!. */
2597   switch (node->type)
2598     {
2599     case LITERAL:
2600       break;
2601     case UNION:
2602       uni = (tre_union_t *)node->obj;
2603       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2604       if (errcode != REG_OK)
2605         return errcode;
2606       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2607       break;
2608
2609     case CATENATION:
2610       cat = (tre_catenation_t *)node->obj;
2611       /* Add a transition from each position in cat->left->lastpos
2612          to each position in cat->right->firstpos. */
2613       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2614                                transitions, counts, offs);
2615       if (errcode != REG_OK)
2616         return errcode;
2617       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2618       if (errcode != REG_OK)
2619         return errcode;
2620       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2621       break;
2622
2623     case ITERATION:
2624       iter = (tre_iteration_t *)node->obj;
2625       assert(iter->max == -1 || iter->max == 1);
2626
2627       if (iter->max == -1)
2628         {
2629           assert(iter->min == 0 || iter->min == 1);
2630           /* Add a transition from each last position in the iterated
2631              expression to each first position. */
2632           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2633                                    transitions, counts, offs);
2634           if (errcode != REG_OK)
2635             return errcode;
2636         }
2637       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2638       break;
2639     }
2640   return errcode;
2641 }
2642
2643
2644 #define ERROR_EXIT(err)           \
2645   do                              \
2646     {                             \
2647       errcode = err;              \
2648       if (/*CONSTCOND*/1)         \
2649         goto error_exit;          \
2650     }                             \
2651  while (/*CONSTCOND*/0)
2652
2653
2654 int
2655 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2656 {
2657   tre_stack_t *stack;
2658   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2659   tre_pos_and_tags_t *p;
2660   int *counts = NULL, *offs = NULL;
2661   int i, add = 0;
2662   tre_tnfa_transition_t *transitions, *initial;
2663   tre_tnfa_t *tnfa = NULL;
2664   tre_submatch_data_t *submatch_data;
2665   tre_tag_direction_t *tag_directions = NULL;
2666   reg_errcode_t errcode;
2667   tre_mem_t mem;
2668
2669   /* Parse context. */
2670   tre_parse_ctx_t parse_ctx;
2671
2672   /* Allocate a stack used throughout the compilation process for various
2673      purposes. */
2674   stack = tre_stack_new(512, 10240, 128);
2675   if (!stack)
2676     return REG_ESPACE;
2677   /* Allocate a fast memory allocator. */
2678   mem = tre_mem_new();
2679   if (!mem)
2680     {
2681       tre_stack_destroy(stack);
2682       return REG_ESPACE;
2683     }
2684
2685   /* Parse the regexp. */
2686   memset(&parse_ctx, 0, sizeof(parse_ctx));
2687   parse_ctx.mem = mem;
2688   parse_ctx.stack = stack;
2689   parse_ctx.re = regex;
2690   parse_ctx.cflags = cflags;
2691   parse_ctx.max_backref = -1;
2692   errcode = tre_parse(&parse_ctx);
2693   if (errcode != REG_OK)
2694     ERROR_EXIT(errcode);
2695   preg->re_nsub = parse_ctx.submatch_id - 1;
2696   tree = parse_ctx.n;
2697
2698 #ifdef TRE_DEBUG
2699   tre_ast_print(tree);
2700 #endif /* TRE_DEBUG */
2701
2702   /* Referring to nonexistent subexpressions is illegal. */
2703   if (parse_ctx.max_backref > (int)preg->re_nsub)
2704     ERROR_EXIT(REG_ESUBREG);
2705
2706   /* Allocate the TNFA struct. */
2707   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2708   if (tnfa == NULL)
2709     ERROR_EXIT(REG_ESPACE);
2710   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2711   tnfa->have_approx = 0;
2712   tnfa->num_submatches = parse_ctx.submatch_id;
2713
2714   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2715      regexp does not have back references, this can be skipped. */
2716   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2717     {
2718
2719       /* Figure out how many tags we will need. */
2720       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2721       if (errcode != REG_OK)
2722         ERROR_EXIT(errcode);
2723
2724       if (tnfa->num_tags > 0)
2725         {
2726           tag_directions = xmalloc(sizeof(*tag_directions)
2727                                    * (tnfa->num_tags + 1));
2728           if (tag_directions == NULL)
2729             ERROR_EXIT(REG_ESPACE);
2730           tnfa->tag_directions = tag_directions;
2731           memset(tag_directions, -1,
2732                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
2733         }
2734       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2735                                    sizeof(*tnfa->minimal_tags));
2736       if (tnfa->minimal_tags == NULL)
2737         ERROR_EXIT(REG_ESPACE);
2738
2739       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2740                               sizeof(*submatch_data));
2741       if (submatch_data == NULL)
2742         ERROR_EXIT(REG_ESPACE);
2743       tnfa->submatch_data = submatch_data;
2744
2745       errcode = tre_add_tags(mem, stack, tree, tnfa);
2746       if (errcode != REG_OK)
2747         ERROR_EXIT(errcode);
2748
2749     }
2750
2751   /* Expand iteration nodes. */
2752   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2753                            tag_directions);
2754   if (errcode != REG_OK)
2755     ERROR_EXIT(errcode);
2756
2757   /* Add a dummy node for the final state.
2758      XXX - For certain patterns this dummy node can be optimized away,
2759            for example "a*" or "ab*".   Figure out a simple way to detect
2760            this possibility. */
2761   tmp_ast_l = tree;
2762   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2763   if (tmp_ast_r == NULL)
2764     ERROR_EXIT(REG_ESPACE);
2765
2766   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2767   if (tree == NULL)
2768     ERROR_EXIT(REG_ESPACE);
2769
2770   errcode = tre_compute_nfl(mem, stack, tree);
2771   if (errcode != REG_OK)
2772     ERROR_EXIT(errcode);
2773
2774   counts = xmalloc(sizeof(int) * parse_ctx.position);
2775   if (counts == NULL)
2776     ERROR_EXIT(REG_ESPACE);
2777
2778   offs = xmalloc(sizeof(int) * parse_ctx.position);
2779   if (offs == NULL)
2780     ERROR_EXIT(REG_ESPACE);
2781
2782   for (i = 0; i < parse_ctx.position; i++)
2783     counts[i] = 0;
2784   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2785
2786   add = 0;
2787   for (i = 0; i < parse_ctx.position; i++)
2788     {
2789       offs[i] = add;
2790       add += counts[i] + 1;
2791       counts[i] = 0;
2792     }
2793   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2794   if (transitions == NULL)
2795     ERROR_EXIT(REG_ESPACE);
2796   tnfa->transitions = transitions;
2797   tnfa->num_transitions = add;
2798
2799   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2800   if (errcode != REG_OK)
2801     ERROR_EXIT(errcode);
2802
2803   tnfa->firstpos_chars = NULL;
2804
2805   p = tree->firstpos;
2806   i = 0;
2807   while (p->position >= 0)
2808     {
2809       i++;
2810       p++;
2811     }
2812
2813   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2814   if (initial == NULL)
2815     ERROR_EXIT(REG_ESPACE);
2816   tnfa->initial = initial;
2817
2818   i = 0;
2819   for (p = tree->firstpos; p->position >= 0; p++)
2820     {
2821       initial[i].state = transitions + offs[p->position];
2822       initial[i].state_id = p->position;
2823       initial[i].tags = NULL;
2824       /* Copy the arrays p->tags, and p->params, they are allocated
2825          from a tre_mem object. */
2826       if (p->tags)
2827         {
2828           int j;
2829           for (j = 0; p->tags[j] >= 0; j++);
2830           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2831           if (!initial[i].tags)
2832             ERROR_EXIT(REG_ESPACE);
2833           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2834         }
2835       initial[i].assertions = p->assertions;
2836       i++;
2837     }
2838   initial[i].state = NULL;
2839
2840   tnfa->num_transitions = add;
2841   tnfa->final = transitions + offs[tree->lastpos[0].position];
2842   tnfa->num_states = parse_ctx.position;
2843   tnfa->cflags = cflags;
2844
2845   tre_mem_destroy(mem);
2846   tre_stack_destroy(stack);
2847   xfree(counts);
2848   xfree(offs);
2849
2850   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2851   return REG_OK;
2852
2853  error_exit:
2854   /* Free everything that was allocated and return the error code. */
2855   tre_mem_destroy(mem);
2856   if (stack != NULL)
2857     tre_stack_destroy(stack);
2858   if (counts != NULL)
2859     xfree(counts);
2860   if (offs != NULL)
2861     xfree(offs);
2862   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2863   regfree(preg);
2864   return errcode;
2865 }
2866
2867
2868
2869
2870 void
2871 regfree(regex_t *preg)
2872 {
2873   tre_tnfa_t *tnfa;
2874   unsigned int i;
2875   tre_tnfa_transition_t *trans;
2876
2877   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2878   if (!tnfa)
2879     return;
2880
2881   for (i = 0; i < tnfa->num_transitions; i++)
2882     if (tnfa->transitions[i].state)
2883       {
2884         if (tnfa->transitions[i].tags)
2885           xfree(tnfa->transitions[i].tags);
2886         if (tnfa->transitions[i].neg_classes)
2887           xfree(tnfa->transitions[i].neg_classes);
2888       }
2889   if (tnfa->transitions)
2890     xfree(tnfa->transitions);
2891
2892   if (tnfa->initial)
2893     {
2894       for (trans = tnfa->initial; trans->state; trans++)
2895         {
2896           if (trans->tags)
2897             xfree(trans->tags);
2898         }
2899       xfree(tnfa->initial);
2900     }
2901
2902   if (tnfa->submatch_data)
2903     {
2904       for (i = 0; i < tnfa->num_submatches; i++)
2905         if (tnfa->submatch_data[i].parents)
2906           xfree(tnfa->submatch_data[i].parents);
2907       xfree(tnfa->submatch_data);
2908     }
2909
2910   if (tnfa->tag_directions)
2911     xfree(tnfa->tag_directions);
2912   if (tnfa->firstpos_chars)
2913     xfree(tnfa->firstpos_chars);
2914   if (tnfa->minimal_tags)
2915     xfree(tnfa->minimal_tags);
2916   xfree(tnfa);
2917 }