nsz Git - musl/blob - src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37 #include <ctype.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56 } tre_pos_and_tags_t;
  57
  58
  59 /***********************************************************************
  60  from tre-ast.c and tre-ast.h
  61 ***********************************************************************/
  62
  63 /* The different AST node types. */
  64 typedef enum {
  65   LITERAL,
  66   CATENATION,
  67   ITERATION,
  68   UNION
  69 } tre_ast_type_t;
  70
  71 /* Special subtypes of TRE_LITERAL. */
  72 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  73 #define ASSERTION -2   /* Assertion leaf. */
  74 #define TAG       -3   /* Tag leaf. */
  75 #define BACKREF   -4   /* Back reference leaf. */
  76
  77 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  78 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  80 #define IS_TAG(x)       ((x)->code_min == TAG)
  81 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  82
  83
  84 /* A generic AST node.  All AST nodes consist of this node on the top
  85    level with `obj' pointing to the actual content. */
  86 typedef struct {
  87   tre_ast_type_t type;   /* Type of the node. */
  88   void *obj;             /* Pointer to actual node. */
  89   int nullable;
  90   int submatch_id;
  91   int num_submatches;
  92   int num_tags;
  93   tre_pos_and_tags_t *firstpos;
  94   tre_pos_and_tags_t *lastpos;
  95 } tre_ast_node_t;
  96
  97
  98 /* A "literal" node.  These are created for assertions, back references,
  99    tags, matching parameter settings, and all expressions that match one
 100    character. */
 101 typedef struct {
 102   long code_min;
 103   long code_max;
 104   int position;
 105   tre_ctype_t class;
 106   tre_ctype_t *neg_classes;
 107 } tre_literal_t;
 108
 109 /* A "catenation" node.  These are created when two regexps are concatenated.
 110    If there are more than one subexpressions in sequence, the `left' part
 111    holds all but the last, and `right' part holds the last subexpression
 112    (catenation is left associative). */
 113 typedef struct {
 114   tre_ast_node_t *left;
 115   tre_ast_node_t *right;
 116 } tre_catenation_t;
 117
 118 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 119    operators. */
 120 typedef struct {
 121   /* Subexpression to match. */
 122   tre_ast_node_t *arg;
 123   /* Minimum number of consecutive matches. */
 124   int min;
 125   /* Maximum number of consecutive matches. */
 126   int max;
 127   /* If 0, match as many characters as possible, if 1 match as few as
 128      possible.  Note that this does not always mean the same thing as
 129      matching as many/few repetitions as possible. */
 130   unsigned int minimal:1;
 131 } tre_iteration_t;
 132
 133 /* An "union" node.  These are created for the "|" operator. */
 134 typedef struct {
 135   tre_ast_node_t *left;
 136   tre_ast_node_t *right;
 137 } tre_union_t;
 138
 139
 140 static tre_ast_node_t *
 141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 142 {
 143         tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
 144         if (!node || !obj)
 145                 return 0;
 146         node->obj = obj;
 147         node->type = type;
 148         node->nullable = -1;
 149         node->submatch_id = -1;
 150         return node;
 151 }
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 155 {
 156         tre_ast_node_t *node;
 157         tre_literal_t *lit;
 158
 159         lit = tre_mem_calloc(mem, sizeof *lit);
 160         node = tre_ast_new_node(mem, LITERAL, lit);
 161         if (!node)
 162                 return 0;
 163         lit->code_min = code_min;
 164         lit->code_max = code_max;
 165         lit->position = position;
 166         return node;
 167 }
 168
 169 static tre_ast_node_t *
 170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 171 {
 172         tre_ast_node_t *node;
 173         tre_iteration_t *iter;
 174
 175         iter = tre_mem_calloc(mem, sizeof *iter);
 176         node = tre_ast_new_node(mem, ITERATION, iter);
 177         if (!node)
 178                 return 0;
 179         iter->arg = arg;
 180         iter->min = min;
 181         iter->max = max;
 182         iter->minimal = minimal;
 183         node->num_submatches = arg->num_submatches;
 184         return node;
 185 }
 186
 187 static tre_ast_node_t *
 188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 189 {
 190         tre_ast_node_t *node;
 191         tre_union_t *un;
 192
 193         if (!left)
 194                 return right;
 195         un = tre_mem_calloc(mem, sizeof *un);
 196         node = tre_ast_new_node(mem, UNION, un);
 197         if (!node || !right)
 198                 return 0;
 199         un->left = left;
 200         un->right = right;
 201         node->num_submatches = left->num_submatches + right->num_submatches;
 202         return node;
 203 }
 204
 205 static tre_ast_node_t *
 206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 207 {
 208         tre_ast_node_t *node;
 209         tre_catenation_t *cat;
 210
 211         if (!left)
 212                 return right;
 213         cat = tre_mem_calloc(mem, sizeof *cat);
 214         node = tre_ast_new_node(mem, CATENATION, cat);
 215         if (!node)
 216                 return 0;
 217         cat->left = left;
 218         cat->right = right;
 219         node->num_submatches = left->num_submatches + right->num_submatches;
 220         return node;
 221 }
 222
 223
 224 /***********************************************************************
 225  from tre-stack.c and tre-stack.h
 226 ***********************************************************************/
 227
 228 typedef struct tre_stack_rec tre_stack_t;
 229
 230 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 231    is maximum size, and `increment' specifies how much more space will be
 232    allocated with realloc() if all space gets used up.  Returns the stack
 233    object or NULL if out of memory. */
 234 static tre_stack_t *
 235 tre_stack_new(int size, int max_size, int increment);
 236
 237 /* Frees the stack object. */
 238 static void
 239 tre_stack_destroy(tre_stack_t *s);
 240
 241 /* Returns the current number of objects in the stack. */
 242 static int
 243 tre_stack_num_objects(tre_stack_t *s);
 244
 245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 247    This tries to realloc() more space before failing if maximum size
 248    has not yet been reached.  Returns REG_OK if successful. */
 249 #define declare_pushf(typetag, type)                                          \
 250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 251
 252 declare_pushf(voidptr, void *);
 253 declare_pushf(int, int);
 254
 255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 256    element off of stack `s' and returns it.  The stack must not be
 257    empty. */
 258 #define declare_popf(typetag, type)               \
 259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 260
 261 declare_popf(voidptr, void *);
 262 declare_popf(int, int);
 263
 264 /* Just to save some typing. */
 265 #define STACK_PUSH(s, typetag, value)                                         \
 266   do                                                                          \
 267     {                                                                         \
 268       status = tre_stack_push_ ## typetag(s, value);                          \
 269     }                                                                         \
 270   while (/*CONSTCOND*/0)
 271
 272 #define STACK_PUSHX(s, typetag, value)                                        \
 273   {                                                                           \
 274     status = tre_stack_push_ ## typetag(s, value);                            \
 275     if (status != REG_OK)                                                     \
 276       break;                                                                  \
 277   }
 278
 279 #define STACK_PUSHR(s, typetag, value)                                        \
 280   {                                                                           \
 281     reg_errcode_t _status;                                                    \
 282     _status = tre_stack_push_ ## typetag(s, value);                           \
 283     if (_status != REG_OK)                                                    \
 284       return _status;                                                         \
 285   }
 286
 287 union tre_stack_item {
 288   void *voidptr_value;
 289   int int_value;
 290 };
 291
 292 struct tre_stack_rec {
 293   int size;
 294   int max_size;
 295   int increment;
 296   int ptr;
 297   union tre_stack_item *stack;
 298 };
 299
 300
 301 static tre_stack_t *
 302 tre_stack_new(int size, int max_size, int increment)
 303 {
 304   tre_stack_t *s;
 305
 306   s = xmalloc(sizeof(*s));
 307   if (s != NULL)
 308     {
 309       s->stack = xmalloc(sizeof(*s->stack) * size);
 310       if (s->stack == NULL)
 311         {
 312           xfree(s);
 313           return NULL;
 314         }
 315       s->size = size;
 316       s->max_size = max_size;
 317       s->increment = increment;
 318       s->ptr = 0;
 319     }
 320   return s;
 321 }
 322
 323 static void
 324 tre_stack_destroy(tre_stack_t *s)
 325 {
 326   xfree(s->stack);
 327   xfree(s);
 328 }
 329
 330 static int
 331 tre_stack_num_objects(tre_stack_t *s)
 332 {
 333   return s->ptr;
 334 }
 335
 336 static reg_errcode_t
 337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 338 {
 339   if (s->ptr < s->size)
 340     {
 341       s->stack[s->ptr] = value;
 342       s->ptr++;
 343     }
 344   else
 345     {
 346       if (s->size >= s->max_size)
 347         {
 348           return REG_ESPACE;
 349         }
 350       else
 351         {
 352           union tre_stack_item *new_buffer;
 353           int new_size;
 354           new_size = s->size + s->increment;
 355           if (new_size > s->max_size)
 356             new_size = s->max_size;
 357           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 358           if (new_buffer == NULL)
 359             {
 360               return REG_ESPACE;
 361             }
 362           assert(new_size > s->size);
 363           s->size = new_size;
 364           s->stack = new_buffer;
 365           tre_stack_push(s, value);
 366         }
 367     }
 368   return REG_OK;
 369 }
 370
 371 #define define_pushf(typetag, type)  \
 372   declare_pushf(typetag, type) {     \
 373     union tre_stack_item item;       \
 374     item.typetag ## _value = value;  \
 375     return tre_stack_push(s, item);  \
 376 }
 377
 378 define_pushf(int, int)
 379 define_pushf(voidptr, void *)
 380
 381 #define define_popf(typetag, type)                  \
 382   declare_popf(typetag, type) {                     \
 383     return s->stack[--s->ptr].typetag ## _value;    \
 384   }
 385
 386 define_popf(int, int)
 387 define_popf(voidptr, void *)
 388
 389
 390 /***********************************************************************
 391  from tre-parse.c and tre-parse.h
 392 ***********************************************************************/
 393
 394 /* Parse context. */
 395 typedef struct {
 396         /* Memory allocator. The AST is allocated using this. */
 397         tre_mem_t mem;
 398         /* Stack used for keeping track of regexp syntax. */
 399         tre_stack_t *stack;
 400         /* The parsed node after a parse function returns. */
 401         tre_ast_node_t *n;
 402         /* Position in the regexp pattern after a parse function returns. */
 403         const char *s;
 404         /* The first character of the regexp. */
 405         const char *re;
 406         /* Current submatch ID. */
 407         int submatch_id;
 408         /* Current position (number of literal). */
 409         int position;
 410         /* The highest back reference or -1 if none seen so far. */
 411         int max_backref;
 412         /* Compilation flags. */
 413         int cflags;
 414 } tre_parse_ctx_t;
 415
 416 /* Some macros for expanding \w, \s, etc. */
 417 static const struct {
 418         char c;
 419         const char *expansion;
 420 } tre_macros[] = {
 421         {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 422         {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 423         {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 424         {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
 425         { 0, 0 }
 426 };
 427
 428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 429    must have at least `len' items.  Sets buf[0] to zero if the there
 430    is no match in `tre_macros'. */
 431 static const char *tre_expand_macro(const char *s)
 432 {
 433         int i;
 434         for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
 435         return tre_macros[i].expansion;
 436 }
 437
 438 static int
 439 tre_compare_lit(const void *a, const void *b)
 440 {
 441         const tre_literal_t *const *la = a;
 442         const tre_literal_t *const *lb = b;
 443         /* assumes the range of valid code_min is < INT_MAX */
 444         return la[0]->code_min - lb[0]->code_min;
 445 }
 446
 447 struct literals {
 448         tre_mem_t mem;
 449         tre_literal_t **a;
 450         int len;
 451         int cap;
 452 };
 453
 454 static tre_literal_t *tre_new_lit(struct literals *p)
 455 {
 456         tre_literal_t **a;
 457         if (p->len >= p->cap) {
 458                 if (p->cap >= 1<<15)
 459                         return 0;
 460                 p->cap *= 2;
 461                 a = xrealloc(p->a, p->cap * sizeof *p->a);
 462                 if (!a)
 463                         return 0;
 464                 p->a = a;
 465         }
 466         a = p->a + p->len++;
 467         *a = tre_mem_calloc(p->mem, sizeof **a);
 468         return *a;
 469 }
 470
 471 static int add_icase_literals(struct literals *ls, int min, int max)
 472 {
 473         tre_literal_t *lit;
 474         int b, e, c;
 475         for (c=min; c<=max; ) {
 476                 /* assumes islower(c) and isupper(c) are exclusive
 477                    and toupper(c)!=c if islower(c).
 478                    multiple opposite case characters are not supported */
 479                 if (tre_islower(c)) {
 480                         b = e = tre_toupper(c);
 481                         for (c++, e++; c<=max; c++, e++)
 482                                 if (tre_toupper(c) != e) break;
 483                 } else if (tre_isupper(c)) {
 484                         b = e = tre_tolower(c);
 485                         for (c++, e++; c<=max; c++, e++)
 486                                 if (tre_tolower(c) != e) break;
 487                 } else {
 488                         c++;
 489                         continue;
 490                 }
 491                 lit = tre_new_lit(ls);
 492                 if (!lit)
 493                         return -1;
 494                 lit->code_min = b;
 495                 lit->code_max = e-1;
 496                 lit->position = -1;
 497         }
 498         return 0;
 499 }
 500
 501
 502 /* Maximum number of character classes in a negated bracket expression. */
 503 #define MAX_NEG_CLASSES 64
 504
 505 struct neg {
 506         int negate;
 507         int len;
 508         tre_ctype_t a[MAX_NEG_CLASSES];
 509 };
 510
 511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 512
 513 /*
 514 bracket grammar:
 515 Bracket  =  '[' List ']'  |  '[^' List ']'
 516 List     =  Term  |  List Term
 517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
 518 Range    =  Char '-' Char  |  Char '-' '-'
 519 Char     =  Coll  |  coll_single
 520 Meta     =  ']'  |  '-'
 521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
 522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
 523 Chclass  =  '[:' class ':]'
 524
 525 coll_single is a single char collating element but it can be
 526  '-' only at the beginning or end of a List and
 527  ']' only at the beginning of a List and
  528  '^' anywhere except after the opening '['
 529 */
 530
 531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
 532 {
 533         const char *start = s;
 534         tre_ctype_t class;
 535         int min, max;
 536         wchar_t wc;
 537         int len;
 538
 539         for (;;) {
 540                 class = 0;
 541                 len = mbtowc(&wc, s, -1);
 542                 if (len <= 0)
 543                         return *s ? REG_BADPAT : REG_EBRACK;
 544                 if (*s == ']' && s != start) {
 545                         ctx->s = s+1;
 546                         return REG_OK;
 547                 }
 548                 if (*s == '-' && s != start && s[1] != ']' &&
 549                     /* extension: [a-z--@] is accepted as [a-z]|[--@] */
 550                     (s[1] != '-' || s[2] == ']'))
 551                         return REG_ERANGE;
 552                 if (*s == '[' && (s[1] == '.' || s[1] == '='))
 553                         /* collating symbols and equivalence classes are not supported */
 554                         return REG_ECOLLATE;
 555                 if (*s == '[' && s[1] == ':') {
 556                         char tmp[CHARCLASS_NAME_MAX+1];
 557                         s += 2;
 558                         for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
 559                                 if (s[len] == ':') {
 560                                         memcpy(tmp, s, len);
 561                                         tmp[len] = 0;
 562                                         class = tre_ctype(tmp);
 563                                         break;
 564                                 }
 565                         }
 566                         if (!class || s[len+1] != ']')
 567                                 return REG_ECTYPE;
 568                         min = 0;
 569                         max = TRE_CHAR_MAX;
 570                         s += len+2;
 571                 } else {
 572                         min = max = wc;
 573                         s += len;
 574                         if (*s == '-' && s[1] != ']') {
 575                                 s++;
 576                                 len = mbtowc(&wc, s, -1);
 577                                 max = wc;
 578                                 /* XXX - Should use collation order instead of
 579                                    encoding values in character ranges. */
 580                                 if (len <= 0 || min > max)
 581                                         return REG_ERANGE;
 582                                 s += len;
 583                         }
 584                 }
 585
 586                 if (class && neg->negate) {
 587                         if (neg->len >= MAX_NEG_CLASSES)
 588                                 return REG_ESPACE;
 589                         neg->a[neg->len++] = class;
 590                 } else  {
 591                         tre_literal_t *lit = tre_new_lit(ls);
 592                         if (!lit)
 593                                 return REG_ESPACE;
 594                         lit->code_min = min;
 595                         lit->code_max = max;
 596                         lit->class = class;
 597                         lit->position = -1;
 598
 599                         /* Add opposite-case codepoints if REG_ICASE is present.
 600                            It seems that POSIX requires that bracket negation
 601                            should happen before case-folding, but most practical
 602                            implementations do it the other way around. Changing
 603                            the order would need efficient representation of
 604                            case-fold ranges and bracket range sets even with
 605                            simple patterns so this is ok for now. */
 606                         if (ctx->cflags & REG_ICASE && !class)
 607                                 if (add_icase_literals(ls, min, max))
 608                                         return REG_ESPACE;
 609                 }
 610         }
 611 }
 612
 613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 614 {
 615         int i, max, min, negmax, negmin;
 616         tre_ast_node_t *node = 0, *n;
 617         tre_ctype_t *nc = 0;
 618         tre_literal_t *lit;
 619         struct literals ls;
 620         struct neg neg;
 621         reg_errcode_t err;
 622
 623         ls.mem = ctx->mem;
 624         ls.len = 0;
 625         ls.cap = 32;
 626         ls.a = xmalloc(ls.cap * sizeof *ls.a);
 627         if (!ls.a)
 628                 return REG_ESPACE;
 629         neg.len = 0;
 630         neg.negate = *s == '^';
 631         if (neg.negate)
 632                 s++;
 633
 634         err = parse_bracket_terms(ctx, s, &ls, &neg);
 635         if (err != REG_OK)
 636                 goto parse_bracket_done;
 637
 638         if (neg.negate) {
 639                 /* Sort the array if we need to negate it. */
 640                 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
 641                 /* extra lit for the last negated range */
 642                 lit = tre_new_lit(&ls);
 643                 if (!lit) {
 644                         err = REG_ESPACE;
 645                         goto parse_bracket_done;
 646                 }
 647                 lit->code_min = TRE_CHAR_MAX+1;
 648                 lit->code_max = TRE_CHAR_MAX+1;
 649                 lit->position = -1;
 650                 /* negated classes */
 651                 if (neg.len) {
 652                         nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
 653                         if (!nc) {
 654                                 err = REG_ESPACE;
 655                                 goto parse_bracket_done;
 656                         }
 657                         memcpy(nc, neg.a, neg.len*sizeof *neg.a);
 658                         nc[neg.len] = 0;
 659                 }
 660         }
 661
 662         /* Build a union of the items in the array, negated if necessary. */
 663         negmax = negmin = 0;
 664         for (i = 0; i < ls.len; i++) {
 665                 lit = ls.a[i];
 666                 min = lit->code_min;
 667                 max = lit->code_max;
 668                 if (neg.negate) {
 669                         if (min <= negmin) {
 670                                 /* Overlap. */
 671                                 negmin = MAX(max + 1, negmin);
 672                                 continue;
 673                         }
 674                         negmax = min - 1;
 675                         lit->code_min = negmin;
 676                         lit->code_max = negmax;
 677                         negmin = max + 1;
 678                 }
 679                 lit->position = ctx->position;
 680                 lit->neg_classes = nc;
 681                 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
 682                 node = tre_ast_new_union(ctx->mem, node, n);
 683                 if (!node) {
 684                         err = REG_ESPACE;
 685                         break;
 686                 }
 687         }
 688
 689 parse_bracket_done:
 690         xfree(ls.a);
 691         ctx->position++;
 692         ctx->n = node;
 693         return err;
 694 }
 695
 696 static const char *parse_dup_count(const char *s, int *n)
 697 {
 698         *n = -1;
 699         if (!isdigit(*s))
 700                 return s;
 701         *n = 0;
 702         for (;;) {
 703                 *n = 10 * *n + (*s - '0');
 704                 s++;
 705                 if (!isdigit(*s) || *n > RE_DUP_MAX)
 706                         break;
 707         }
 708         return s;
 709 }
 710
 711 static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
 712 {
 713         int min, max;
 714
 715         s = parse_dup_count(s, &min);
 716         if (*s == ',')
 717                 s = parse_dup_count(s+1, &max);
 718         else
 719                 max = min;
 720
 721         if (
 722                 (max < min && max >= 0) ||
 723                 max > RE_DUP_MAX ||
 724                 min > RE_DUP_MAX ||
 725                 min < 0 ||
 726                 (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
 727                 *s++ != '}'
 728         )
 729                 return REG_BADBR;
 730
 731         if (min == 0 && max == 0)
 732                 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 733         else
 734                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
 735         if (!ctx->n)
 736                 return REG_ESPACE;
 737         ctx->s = s;
 738         return REG_OK;
 739 }
 740
 741 static int hexval(unsigned c)
 742 {
 743         if (c-'0'<10) return c-'0';
 744         c |= 32;
 745         if (c-'a'<6) return c-'a'+10;
 746         return -1;
 747 }
 748
 749 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
 750 {
 751         if (node->submatch_id >= 0) {
 752                 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 753                 if (!n)
 754                         return REG_ESPACE;
 755                 n = tre_ast_new_catenation(ctx->mem, n, node);
 756                 if (!n)
 757                         return REG_ESPACE;
 758                 n->num_submatches = node->num_submatches;
 759                 node = n;
 760         }
 761         node->submatch_id = subid;
 762         node->num_submatches++;
 763         ctx->n = node;
 764         return REG_OK;
 765 }
 766
 767 /*
 768 BRE grammar:
 769 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
 770 Branch =  Atom  |  Branch Atom
 771 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
 772 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 773
 774 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 775
 776 ERE grammar:
 777 Regex  =  Branch  |  Regex '|' Branch
 778 Branch =  Atom  |  Branch Atom
 779 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
 780 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 781
 782 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 783 */
 784
 785 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
 786 {
 787         int len, ere = ctx->cflags & REG_EXTENDED;
 788         const char *p;
 789         tre_ast_node_t *node;
 790         wchar_t wc;
 791         switch (*s) {
 792         case '[':
 793                 return parse_bracket(ctx, s+1);
 794         case '\\':
 795                 p = tre_expand_macro(s+1);
 796                 if (p) {
 797                         /* assume \X expansion is a single atom */
 798                         reg_errcode_t err = parse_atom(ctx, p);
 799                         ctx->s = s+2;
 800                         return err;
 801                 }
 802                 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
 803                 switch (*++s) {
 804                 case 0:
 805                         return REG_EESCAPE;
 806                 case 'b':
 807                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
 808                         break;
 809                 case 'B':
 810                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
 811                         break;
 812                 case '<':
 813                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
 814                         break;
 815                 case '>':
 816                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
 817                         break;
 818                 case 'x':
 819                         s++;
 820                         int i, v = 0, c;
 821                         len = 2;
 822                         if (*s == '{') {
 823                                 len = 8;
 824                                 s++;
 825                         }
 826                         for (i=0; i<len && v<0x110000; i++) {
 827                                 c = hexval(s[i]);
 828                                 if (c < 0) break;
 829                                 v = 16*v + c;
 830                         }
 831                         s += i;
 832                         if (len == 8) {
 833                                 if (*s != '}')
 834                                         return REG_EBRACE;
 835                                 s++;
 836                         }
 837                         node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);
 838                         ctx->position++;
 839                         s--;
 840                         break;
 841                 default:
 842                         if (!ere && (unsigned)*s-'1' < 9) {
 843                                 /* back reference */
 844                                 int val = *s - '0';
 845                                 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position);
 846                                 ctx->max_backref = MAX(val, ctx->max_backref);
 847                         } else {
 848                                 /* extension: accept unknown escaped char
 849                                    as a literal */
 850                                 goto parse_literal;
 851                         }
 852                         ctx->position++;
 853                 }
 854                 s++;
 855                 break;
 856         case '.':
 857                 if (ctx->cflags & REG_NEWLINE) {
 858                         tre_ast_node_t *tmp1, *tmp2;
 859                         tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
 860                         tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
 861                         if (tmp1 && tmp2)
 862                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 863                         else
 864                                 node = 0;
 865                 } else {
 866                         node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 867                 }
 868                 s++;
 869                 break;
 870         case '^':
 871                 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
 872                 if (!ere && s != ctx->re)
 873                         goto parse_literal;
 874                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
 875                 s++;
 876                 break;
 877         case '$':
 878                 /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
 879                 if (!ere && s[1])
 880                         goto parse_literal;
 881                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
 882                 s++;
 883                 break;
 884         case '*':
 885         case '|':
 886         case '{':
 887         case '+':
 888         case '?':
 889                 if (!ere)
 890                         goto parse_literal;
 891         case 0:
 892                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 893                 break;
 894         default:
 895 parse_literal:
 896                 len = mbtowc(&wc, s, -1);
 897                 if (len < 0)
 898                         return REG_BADPAT;
 899                 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
 900                         tre_ast_node_t *tmp1, *tmp2;
 901                         /* multiple opposite case characters are not supported */
 902                         tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
 903                         tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
 904                         if (tmp1 && tmp2)
 905                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 906                         else
 907                                 node = 0;
 908                 } else {
 909                         node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 910                 }
 911                 ctx->position++;
 912                 s += len;
 913                 break;
 914         }
 915         if (!node)
 916                 return REG_ESPACE;
 917         ctx->n = node;
 918         ctx->s = s;
 919         return REG_OK;
 920 }
 921
 922 /* Push pointer v on stack s; on failure set err and return it from the user. */
 923 #define PUSHPTR(err, s, v) do { \
 924         if (REG_OK != (err = tre_stack_push_voidptr(s, v))) return err; \
 925 } while(0)
 926
 927 /* Push int v on stack s; on failure set err and return it from the user. */
 928 #define PUSHINT(err, s, v) do { \
 929         if (REG_OK != (err = tre_stack_push_int(s, v))) return err; \
 930 } while(0)
 931
 932 /* Parse ctx->re (ERE if REG_EXTENDED is set in ctx->cflags, else BRE) without
 933    recursion: a group opening saves the enclosing union/branch and a new
 934    submatch id on ctx->stack, its closing hands the group to marksub() and
 935    restores them.  On REG_OK ctx->submatch_id holds the number of ids used. */
 936 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
 937 {
 938         tre_ast_node_t *nbranch=0, *nunion=0;
 939         int ere = ctx->cflags & REG_EXTENDED;
 940         const char *s = ctx->re;
 941         int subid = 0, depth = 0;
 942         reg_errcode_t err;
 943         tre_stack_t *stack = ctx->stack;
 944
 945         PUSHINT(err, stack, subid++);   /* id 0: the pattern as a whole */
 946         for (;;) {
 947                 if ((!ere && *s == '\\' && s[1] == '(') ||
 948                     (ere && *s == '(')) {
 949                         PUSHPTR(err, stack, nunion);
 950                         PUSHPTR(err, stack, nbranch);
 951                         PUSHINT(err, stack, subid++);
 952                         s += ere ? 1 : 2;
 953                         depth++;
 954                         nbranch = nunion = 0;
 955                         continue;
 956                 }
 957                 if ((!ere && *s == '\\' && s[1] == ')') ||
 958                     (ere && *s == ')' && depth)) {
 959                         /* closing paren where an atom was expected: empty branch */
 960                         ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 961                         if (!ctx->n)
 962                                 return REG_ESPACE;
 963                 } else {
 964                         err = parse_atom(ctx, s);
 965                         if (err != REG_OK)
 966                                 return err;
 967                         s = ctx->s;
 968                 }
 969         parse_iter:
 970                 /* extension: repetitions are accepted after an empty node
 971                    eg. (+), ^*, a$?, a|{2} */
 972                 switch (*s) {
 973                 case '+':
 974                 case '?':
 975                         if (!ere)
 976                                 break;
 977                         /* fallthrough */
 978                 case '*':;
 979                         int min = (*s == '+') ? 1 : 0;
 980                         int max = (*s == '?') ? 1 : -1;
 981                         s++;
 982                         ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
 983                         if (!ctx->n)
 984                                 return REG_ESPACE;
 985                         /* extension: multiple consecutive *+?{,} is unspecified,
 986                            but (a+)+ has to be supported so accepting a++ makes
 987                            sense, note however that the RE_DUP_MAX limit can be
 988                            circumvented: (a{255}){255} uses a lot of memory.. */
 989                         goto parse_iter;
 990                 case '\\':
 991                         if (ere || s[1] != '{')
 992                                 break;
 993                         s++;
 994                         goto parse_brace;
 995                 case '{':
 996                         if (!ere)
 997                                 break;
 998                 parse_brace:
 999                         err = parse_dup(ctx, s+1);
1000                         if (err != REG_OK)
1001                                 return err;
1002                         s = ctx->s;
1003                         goto parse_iter;
1004                 }
1005
1006                 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1007                 if (!nbranch) return REG_ESPACE; /* checked like the other node constructors */
1008                 if ((ere && *s == '|') ||
1009                     (ere && *s == ')' && depth) ||
1010                     (!ere && *s == '\\' && s[1] == ')') ||
1011                     !*s) {
1012                         /* extension: empty branch is unspecified (), (|a), (a|)
1013                            here they are not rejected but match on empty string */
1014                         int c = *s;
1015                         nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1016                         if (!nunion) return REG_ESPACE; /* do not hand NULL to marksub() */
1017                         nbranch = 0;
1018                         if (c != '|') {
1019                                 if (c == '\\') {
1020                                         if (!depth) return REG_EPAREN;
1021                                         s+=2;
1022                                 } else if (c == ')')
1023                                         s++;
1024                                 depth--;
1025                                 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1026                                 if (err != REG_OK)
1027                                         return err;
1028                                 if (!c && depth<0) {
1029                                         ctx->submatch_id = subid;
1030                                         return REG_OK;
1031                                 }
1032                                 if (!c || depth<0)
1033                                         return REG_EPAREN;
1034                                 nbranch = tre_stack_pop_voidptr(stack);
1035                                 nunion = tre_stack_pop_voidptr(stack);
1036                                 goto parse_iter;
1037                         }
1038                         s++;
1039                 }
1040         }
1041 }
1042
1043
1044 /***********************************************************************
1045  from tre-compile.c
1046 ***********************************************************************/
1047
1048
1049 /*
1050   TODO:
1051    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1052      function calls.
1053 */
1054
1055 /*
1056   Algorithms to set up tags so that submatch addressing can be done.
1057 */
1058
1059
1060 /* Turns the tree root `node' into a catenation, in place.  The left child
1061    is a new TAG literal numbered `tag_id'; the right child is a new node that
1062    takes over the old root's type and obj.  REG_ESPACE if allocation fails. */
1063 static reg_errcode_t
1064 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1065 {
1066   tre_ast_node_t *tag_node, *old_root;
1067   tre_catenation_t *cat = tre_mem_alloc(mem, sizeof(*cat));
1068
1069   if (!cat)
1070     return REG_ESPACE;
1071   if (!(tag_node = tre_ast_new_literal(mem, TAG, tag_id, -1)))
1072     return REG_ESPACE;
1073   if (!(old_root = tre_mem_alloc(mem, sizeof(*old_root))))
1074     return REG_ESPACE;
1075   /* Only type and obj move to the new child; its other fields start afresh. */
1076   old_root->obj = node->obj;
1077   old_root->type = node->type;
1078   old_root->nullable = -1;
1079   old_root->submatch_id = -1;
1080   old_root->firstpos = NULL;
1081   old_root->lastpos = NULL;
1082   old_root->num_tags = 0;
1083   cat->left = tag_node;
1084   cat->right = old_root;
1085   node->obj = cat;
1086   node->type = CATENATION;
1087   return REG_OK;
1088 }
1089
1090 /* Turns the tree root `node' into a catenation, in place.  The left child
1091    is a new node that takes over the old root's type and obj; the right child
1092    is a new TAG literal numbered `tag_id'.  REG_ESPACE if allocation fails. */
1093 static reg_errcode_t
1094 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1095 {
1096   tre_ast_node_t *tag_node, *old_root;
1097   tre_catenation_t *cat = tre_mem_alloc(mem, sizeof(*cat));
1098
1099   if (!cat)
1100     return REG_ESPACE;
1101   if (!(tag_node = tre_ast_new_literal(mem, TAG, tag_id, -1)))
1102     return REG_ESPACE;
1103   if (!(old_root = tre_mem_alloc(mem, sizeof(*old_root))))
1104     return REG_ESPACE;
1105   /* Only type and obj move to the new child; its other fields start afresh. */
1106   old_root->obj = node->obj;
1107   old_root->type = node->type;
1108   old_root->nullable = -1;
1109   old_root->submatch_id = -1;
1110   old_root->firstpos = NULL;
1111   old_root->lastpos = NULL;
1112   old_root->num_tags = 0;
1113   cat->left = old_root;
1114   cat->right = tag_node;
1115   node->obj = cat;
1116   node->type = CATENATION;
1117   return REG_OK;
1118 }
1119
1120 typedef enum {                /* Work items tre_add_tags() keeps on its stack. */
1121   ADDTAGS_RECURSE,            /* pop a node and process it, pushing work for its children */
1122   ADDTAGS_AFTER_ITERATION,    /* the argument of an ITERATION node has been processed */
1123   ADDTAGS_AFTER_UNION_LEFT,   /* between the left and the right operand of a UNION */
1124   ADDTAGS_AFTER_UNION_RIGHT,  /* both operands of a UNION have been processed */
1125   ADDTAGS_AFTER_CAT_LEFT,     /* between the left and the right child of a CATENATION */
1126   ADDTAGS_AFTER_CAT_RIGHT,    /* both children of a CATENATION have been processed */
1127   ADDTAGS_SET_SUBMATCH_END    /* pop a submatch id and queue its end in the regset */
1128 } tre_addtags_symbol_t;
1129
1130
1131 typedef struct {   /* A pair of tag numbers. */
1132   int tag;         /* NOTE(review): never read in the visible code -- confirm it is still needed */
1133   int next_tag;    /* NOTE(review): never accessed in the visible code -- confirm it is still needed */
1134 } tre_tag_states_t;
1135
1136
1137 /* Hands `tag' to every submatch boundary queued in `regset' (a list ended by
1138    a negative value) and then empties the list. */
1139 static void
1140 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1141 {
1142   int *entry;
1143
1144   for (entry = regset; *entry >= 0; entry++)
1145     {
1146       int id = *entry / 2;
1147       /* Even entries stand for a submatch start, odd entries for its end. */
1148       if (*entry % 2 == 0)
1149         tnfa->submatch_data[id].so_tag = tag;
1150       else
1151         tnfa->submatch_data[id].eo_tag = tag;
1152     }
1153   regset[0] = -1;
1154 }
1155
1156
1157 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1158    subexpressions marked for submatch addressing can be traced.
1159
1160    The tree is walked iteratively; pending work is kept on `stack' as
1161    tre_addtags_symbol_t items together with their operands.  The function
1162    runs in one of two modes:
1163    - first pass (`mem' is NULL): nothing is inserted, node->num_tags is
1164      recorded on the nodes and the tags are counted;
1165    - second pass: TAG literals are inserted with tre_add_tag_left/right and
1166      tnfa->tag_directions, tnfa->minimal_tags and tnfa->submatch_data are
1167      filled in.
1168    Both passes store the totals in tnfa->end_tag, tnfa->num_tags and
1169    tnfa->num_minimals.  Returns REG_OK or an error code (REG_ESPACE when an
1170    allocation made here fails). */
1171 static reg_errcode_t
1172 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1173              tre_tnfa_t *tnfa)
1174 {
1175   reg_errcode_t status = REG_OK;
1176   tre_addtags_symbol_t symbol;
1177   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1178   int bottom = tre_stack_num_objects(stack);
1179   /* True for first pass (counting number of needed tags).  Note that `tnfa'
1180      is dereferenced unconditionally below, so a valid `tnfa' is needed in
1181      both passes even though NULL is tested for here. */
1182   int first_pass = (mem == NULL || tnfa == NULL);
1183   int *regset, *orig_regset;
1184   int num_tags = 0; /* Total number of tags. */
1185   int num_minimals = 0;  /* Number of special minimal tags. */
1186   int tag = 0;      /* The tag that is to be added next. */
1187   int next_tag = 1; /* Next tag to use after this one. */
1188   int *parents;     /* Stack of submatches the current submatch is
1189                        contained in. */
1190   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1191
1192   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1193   if (!first_pass)
1194     {
1195       tnfa->end_tag = 0;
1196       tnfa->minimal_tags[0] = -1;
1197     }
1198
1199   /* Queue of submatch boundaries (2*id = start, 2*id+1 = end), ended by -1. */
1200   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1201   if (regset == NULL)
1202     return REG_ESPACE;
1203   regset[0] = -1;
1204   orig_regset = regset;
1205
1206   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1207   if (parents == NULL)
1208     {
1209       xfree(regset);
1210       return REG_ESPACE;
1211     }
1212   parents[0] = -1;
1213
1214   STACK_PUSH(stack, voidptr, node);
1215   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1216
1217   while (tre_stack_num_objects(stack) > bottom)
1218     {
1219       if (status != REG_OK)
1220         break;
1221
1222       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1223       switch (symbol)
1224         {
1225
1226         case ADDTAGS_SET_SUBMATCH_END:
1227           {
1228             int id = tre_stack_pop_int(stack);
1229             int i;
1230
1231             /* Add end of this submatch to regset. */
1232             for (i = 0; regset[i] >= 0; i++);
1233             regset[i] = id * 2 + 1;
1234             regset[i + 1] = -1;
1235
1236             /* Pop this submatch from the parents stack. */
1237             for (i = 0; parents[i] >= 0; i++);
1238             parents[i - 1] = -1;
1239             break;
1240           }
1241
1242         case ADDTAGS_RECURSE:
1243           node = tre_stack_pop_voidptr(stack);
1244
1245           if (node->submatch_id >= 0)
1246             {
1247               int id = node->submatch_id;
1248               int i;
1249
1250
1251               /* Add start of this submatch to regset. */
1252               for (i = 0; regset[i] >= 0; i++);
1253               regset[i] = id * 2;
1254               regset[i + 1] = -1;
1255
1256               if (!first_pass)
1257                 {
1258                   for (i = 0; parents[i] >= 0; i++);
1259                   tnfa->submatch_data[id].parents = NULL;
1260                   if (i > 0)
1261                     {
1262                       int *p = xmalloc(sizeof(*p) * (i + 1));
1263                       if (p == NULL)
1264                         {
1265                           status = REG_ESPACE;
1266                           break;
1267                         }
1268                       assert(tnfa->submatch_data[id].parents == NULL);
1269                       tnfa->submatch_data[id].parents = p;
1270                       for (i = 0; parents[i] >= 0; i++)
1271                         p[i] = parents[i];
1272                       p[i] = -1;
1273                     }
1274                 }
1275
1276               /* Add end of this submatch to regset after processing this
1277                  node. */
1278               STACK_PUSHX(stack, int, node->submatch_id);
1279               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1280             }
1281
1282           switch (node->type)
1283             {
1284             case LITERAL:
1285               {
1286                 tre_literal_t *lit = node->obj;
1287
1288                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1289                   {
1290                     int i;
1291                     if (regset[0] >= 0)
1292                       {
1293                         /* Regset is not empty, so add a tag before the
1294                            literal or backref. */
1295                         if (!first_pass)
1296                           {
1297                             status = tre_add_tag_left(mem, node, tag);
1298                             tnfa->tag_directions[tag] = direction;
1299                             if (minimal_tag >= 0)
1300                               {
1301                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1302                                 tnfa->minimal_tags[i] = tag;
1303                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1304                                 tnfa->minimal_tags[i + 2] = -1;
1305                                 minimal_tag = -1;
1306                                 num_minimals++;
1307                               }
1308                             tre_purge_regset(regset, tnfa, tag);
1309                           }
1310                         else
1311                           {
1312                             node->num_tags = 1;
1313                           }
1314
1315                         regset[0] = -1;
1316                         tag = next_tag;
1317                         num_tags++;
1318                         next_tag++;
1319                       }
1320                   }
1321                 else
1322                   {
1323                     assert(!IS_TAG(lit));
1324                   }
1325                 break;
1326               }
1327             case CATENATION:
1328               {
1329                 tre_catenation_t *cat = node->obj;
1330                 tre_ast_node_t *left = cat->left;
1331                 tre_ast_node_t *right = cat->right;
1332                 int reserved_tag = -1;
1333                 /* Work items are pushed in reverse order of execution. */
1334
1335                 /* After processing right child. */
1336                 STACK_PUSHX(stack, voidptr, node);
1337                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1338
1339                 /* Process right child. */
1340                 STACK_PUSHX(stack, voidptr, right);
1341                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1342
1343                 /* After processing left child. */
1344                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1345                 if (left->num_tags > 0 && right->num_tags > 0)
1346                   {
1347                     /* Reserve the next tag to the right child. */
1348                     reserved_tag = next_tag;
1349                     next_tag++;
1350                   }
1351                 STACK_PUSHX(stack, int, reserved_tag);
1352                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1353
1354                 /* Process left child. */
1355                 STACK_PUSHX(stack, voidptr, left);
1356                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1357
1358                 }
1359               break;
1360             case ITERATION:
1361               {
1362                 tre_iteration_t *iter = node->obj;
1363
1364                 if (first_pass)
1365                   {
1366                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1367                   }
1368                 else
1369                   {
1370                     STACK_PUSHX(stack, int, tag);
1371                     STACK_PUSHX(stack, int, iter->minimal);
1372                   }
1373                 STACK_PUSHX(stack, voidptr, node);
1374                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1375
1376                 STACK_PUSHX(stack, voidptr, iter->arg);
1377                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1378
1379                 /* Regset is not empty, so add a tag here. */
1380                 if (regset[0] >= 0 || iter->minimal)
1381                   {
1382                     if (!first_pass)
1383                       {
1384                         int i;
1385                         status = tre_add_tag_left(mem, node, tag);
1386                         if (iter->minimal)
1387                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1388                         else
1389                           tnfa->tag_directions[tag] = direction;
1390                         if (minimal_tag >= 0)
1391                           {
1392                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1393                             tnfa->minimal_tags[i] = tag;
1394                             tnfa->minimal_tags[i + 1] = minimal_tag;
1395                             tnfa->minimal_tags[i + 2] = -1;
1396                             minimal_tag = -1;
1397                             num_minimals++;
1398                           }
1399                         tre_purge_regset(regset, tnfa, tag);
1400                       }
1401
1402                     regset[0] = -1;
1403                     tag = next_tag;
1404                     num_tags++;
1405                     next_tag++;
1406                   }
1407                 direction = TRE_TAG_MINIMIZE;
1408               }
1409               break;
1410             case UNION:
1411               {
1412                 tre_union_t *uni = node->obj;
1413                 tre_ast_node_t *left = uni->left;
1414                 tre_ast_node_t *right = uni->right;
1415                 int left_tag;
1416                 int right_tag;
1417
1418                 if (regset[0] >= 0)
1419                   {
1420                     left_tag = next_tag;
1421                     right_tag = next_tag + 1;
1422                   }
1423                 else
1424                   {
1425                     left_tag = tag;
1426                     right_tag = next_tag;
1427                   }
1428
1429                 /* After processing right child. */
1430                 STACK_PUSHX(stack, int, right_tag);
1431                 STACK_PUSHX(stack, int, left_tag);
1432                 STACK_PUSHX(stack, voidptr, regset);
1433                 STACK_PUSHX(stack, int, regset[0] >= 0);
1434                 STACK_PUSHX(stack, voidptr, node);
1435                 STACK_PUSHX(stack, voidptr, right);
1436                 STACK_PUSHX(stack, voidptr, left);
1437                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1438
1439                 /* Process right child. */
1440                 STACK_PUSHX(stack, voidptr, right);
1441                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1442
1443                 /* After processing left child. */
1444                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1445
1446                 /* Process left child. */
1447                 STACK_PUSHX(stack, voidptr, left);
1448                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1449
1450                 /* Regset is not empty, so add a tag here. */
1451                 if (regset[0] >= 0)
1452                   {
1453                     if (!first_pass)
1454                       {
1455                         int i;
1456                         status = tre_add_tag_left(mem, node, tag);
1457                         tnfa->tag_directions[tag] = direction;
1458                         if (minimal_tag >= 0)
1459                           {
1460                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1461                             tnfa->minimal_tags[i] = tag;
1462                             tnfa->minimal_tags[i + 1] = minimal_tag;
1463                             tnfa->minimal_tags[i + 2] = -1;
1464                             minimal_tag = -1;
1465                             num_minimals++;
1466                           }
1467                         tre_purge_regset(regset, tnfa, tag);
1468                       }
1469
1470                     regset[0] = -1;
1471                     tag = next_tag;
1472                     num_tags++;
1473                     next_tag++;
1474                   }
1475
1476                 if (node->num_submatches > 0)
1477                   {
1478                     /* The next two tags are reserved for markers. */
1479                     next_tag++;
1480                     tag = next_tag;
1481                     next_tag++;
1482                   }
1483
1484                 break;
1485               }
1486             }
1487
1488           if (node->submatch_id >= 0)
1489             {
1490               int i;
1491               /* Push this submatch on the parents stack. */
1492               for (i = 0; parents[i] >= 0; i++);
1493               parents[i] = node->submatch_id;
1494               parents[i + 1] = -1;
1495             }
1496
1497           break; /* end case: ADDTAGS_RECURSE */
1498
1499         case ADDTAGS_AFTER_ITERATION:
1500           {
1501             int minimal = 0;
1502             int enter_tag;
1503             node = tre_stack_pop_voidptr(stack);
1504             if (first_pass)
1505               {
1506                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1507                   + tre_stack_pop_int(stack);
1508                 minimal_tag = -1;
1509               }
1510             else
1511               {
1512                 minimal = tre_stack_pop_int(stack);
1513                 enter_tag = tre_stack_pop_int(stack);
1514                 if (minimal)
1515                   minimal_tag = enter_tag;
1516               }
1517
1518             if (!first_pass)
1519               {
1520                 if (minimal)
1521                   direction = TRE_TAG_MINIMIZE;
1522                 else
1523                   direction = TRE_TAG_MAXIMIZE;
1524               }
1525             break;
1526           }
1527
1528         case ADDTAGS_AFTER_CAT_LEFT:
1529           {
1530             int new_tag = tre_stack_pop_int(stack);
1531             next_tag = tre_stack_pop_int(stack);
1532             if (new_tag >= 0)
1533               {
1534                 tag = new_tag;
1535               }
1536             break;
1537           }
1538
1539         case ADDTAGS_AFTER_CAT_RIGHT:
1540           node = tre_stack_pop_voidptr(stack);
1541           if (first_pass)
1542             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1543               + ((tre_catenation_t *)node->obj)->right->num_tags;
1544           break;
1545
1546         case ADDTAGS_AFTER_UNION_LEFT:
1547           /* Lift the bottom of the `regset' array so that when processing
1548              the right operand the items currently in the array are
1549              invisible.  The original bottom was pushed in the UNION case of
1550              ADDTAGS_RECURSE; ADDTAGS_AFTER_UNION_RIGHT below restores it. */
1551           while (*regset >= 0)
1552             regset++;
1553           break;
1554
1555         case ADDTAGS_AFTER_UNION_RIGHT:
1556           {
1557             int added_tags, tag_left, tag_right;
1558             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1559             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1560             node = tre_stack_pop_voidptr(stack);
1561             added_tags = tre_stack_pop_int(stack);
1562             if (first_pass)
1563               {
1564                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1565                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1566                   + ((node->num_submatches > 0) ? 2 : 0);
1567               }
1568             regset = tre_stack_pop_voidptr(stack);
1569             tag_left = tre_stack_pop_int(stack);
1570             tag_right = tre_stack_pop_int(stack);
1571
1572             /* Add tags after both children, the left child gets a smaller
1573                tag than the right child.  This guarantees that we prefer
1574                the left child over the right child. */
1575             /* XXX - This is not always necessary (if the children have
1576                tags which must be seen for every match of that child). */
1577             /* XXX - Check if this is the only place where tre_add_tag_right
1578                is used.  If so, use tre_add_tag_left (putting the tag before
1579                the child as opposed after the child) and throw away
1580                tre_add_tag_right. */
1581             if (node->num_submatches > 0)
1582               {
1583                 if (!first_pass)
1584                   {
1585                     status = tre_add_tag_right(mem, left, tag_left);
1586                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1587                     if (status == REG_OK) /* keep a failure from the left child */
1588                       status = tre_add_tag_right(mem, right, tag_right);
1589                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1590                   }
1591                 num_tags += 2;
1592               }
1593             direction = TRE_TAG_MAXIMIZE;
1594             break;
1595           }
1596
1597         default:
1598           assert(0);
1599           break;
1600
1601         } /* end switch(symbol) */
1602     } /* end while(tre_stack_num_objects(stack) > bottom) */
1603
1604   if (!first_pass)
1605     tre_purge_regset(regset, tnfa, tag);
1606
1607   if (!first_pass && minimal_tag >= 0)
1608     {
1609       int i;
1610       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1611       tnfa->minimal_tags[i] = tag;
1612       tnfa->minimal_tags[i + 1] = minimal_tag;
1613       tnfa->minimal_tags[i + 2] = -1;
1614       minimal_tag = -1;
1615       num_minimals++;
1616     }
1617
1618   assert(status != REG_OK || tree->num_tags == num_tags); /* only after a full walk */
1619   tnfa->end_tag = num_tags;
1620   tnfa->num_tags = num_tags;
1621   tnfa->num_minimals = num_minimals;
1622   xfree(orig_regset);
1623   xfree(parents);
1624   return status;
1625 }
1626
1627
1628
1629 /*
1630   AST to TNFA compilation routines.
1631 */
1632
1633 typedef enum {          /* Work items tre_copy_ast() keeps on its stack. */
1634   COPY_RECURSE,         /* pop a node and copy it */
1635   COPY_SET_RESULT_PTR   /* pop the location the next copied node is stored through */
1636 } tre_copyast_symbol_t;
1637
1638 /* Flags for tre_copy_ast(). */
1639 #define COPY_REMOVE_TAGS         1  /* copy TAG literals as EMPTY literals */
1640 #define COPY_MAXIMIZE_FIRST_TAG  2  /* set the first copied tag's direction to TRE_TAG_MAXIMIZE */
1641
1642 static reg_errcode_t
1643 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1644              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1645              tre_ast_node_t **copy, int *max_pos)
1646 {
1647   reg_errcode_t status = REG_OK;
1648   int bottom = tre_stack_num_objects(stack);
1649   int num_copied = 0;
1650   int first_tag = 1;
1651   tre_ast_node_t **result = copy;
1652   tre_copyast_symbol_t symbol;
1653
1654   STACK_PUSH(stack, voidptr, ast);
1655   STACK_PUSH(stack, int, COPY_RECURSE);
1656
1657   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1658     {
1659       tre_ast_node_t *node;
1660       if (status != REG_OK)
1661         break;
1662
1663       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1664       switch (symbol)
1665         {
1666         case COPY_SET_RESULT_PTR:
1667           result = tre_stack_pop_voidptr(stack);
1668           break;
1669         case COPY_RECURSE:
1670           node = tre_stack_pop_voidptr(stack);
1671           switch (node->type)
1672             {
1673             case LITERAL:
1674               {
1675                 tre_literal_t *lit = node->obj;
1676                 int pos = lit->position;
1677                 int min = lit->code_min;
1678                 int max = lit->code_max;
1679                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1680                   {
1681                     /* XXX - e.g. [ab] has only one position but two
1682                        nodes, so we are creating holes in the state space
1683                        here.  Not fatal, just wastes memory. */
1684                     pos += *pos_add;
1685                     num_copied++;
1686                   }
1687                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1688                   {
1689                     /* Change this tag to empty. */
1690                     min = EMPTY;
1691                     max = pos = -1;
1692                   }
1693                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1694                          && first_tag)
1695                   {
1696                     /* Maximize the first tag. */
1697                     tag_directions[max] = TRE_TAG_MAXIMIZE;
1698                     first_tag = 0;
1699                   }
1700                 *result = tre_ast_new_literal(mem, min, max, pos);
1701                 if (*result == NULL)
1702                   status = REG_ESPACE;
1703                 else {
1704                   tre_literal_t *p = (*result)->obj;
1705                   p->class = lit->class;
1706                   p->neg_classes = lit->neg_classes;
1707                 }
1708
1709                 if (pos > *max_pos)
1710                   *max_pos = pos;
1711                 break;
1712               }
1713             case UNION:
1714               {
1715                 tre_union_t *uni = node->obj;
1716                 tre_union_t *tmp;
1717                 *result = tre_ast_new_union(mem, uni->left, uni->right);
1718                 if (*result == NULL)
1719                   {
1720                     status = REG_ESPACE;
1721                     break;
1722                   }
1723                 tmp = (*result)->obj;
1724                 result = &tmp->left;
1725                 STACK_PUSHX(stack, voidptr, uni->right);
1726                 STACK_PUSHX(stack, int, COPY_RECURSE);
1727                 STACK_PUSHX(stack, voidptr, &tmp->right);
1728                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1729                 STACK_PUSHX(stack, voidptr, uni->left);
1730                 STACK_PUSHX(stack, int, COPY_RECURSE);
1731                 break;
1732               }
1733             case CATENATION:
1734               {
1735                 tre_catenation_t *cat = node->obj;
1736                 tre_catenation_t *tmp;
1737                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1738                 if (*result == NULL)
1739                   {
1740                     status = REG_ESPACE;
1741                     break;
1742                   }
1743                 tmp = (*result)->obj;
1744                 tmp->left = NULL;
1745                 tmp->right = NULL;
1746                 result = &tmp->left;
1747
1748                 STACK_PUSHX(stack, voidptr, cat->right);
1749                 STACK_PUSHX(stack, int, COPY_RECURSE);
1750                 STACK_PUSHX(stack, voidptr, &tmp->right);
1751                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1752                 STACK_PUSHX(stack, voidptr, cat->left);
1753                 STACK_PUSHX(stack, int, COPY_RECURSE);
1754                 break;
1755               }
1756             case ITERATION:
1757               {
1758                 tre_iteration_t *iter = node->obj;
1759                 STACK_PUSHX(stack, voidptr, iter->arg);
1760                 STACK_PUSHX(stack, int, COPY_RECURSE);
1761                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1762                                            iter->max, iter->minimal);
1763                 if (*result == NULL)
1764                   {
1765                     status = REG_ESPACE;
1766                     break;
1767                   }
1768                 iter = (*result)->obj;
1769                 result = &iter->arg;
1770                 break;
1771               }
1772             default:
1773               assert(0);
1774               break;
1775             }
1776           break;
1777         }
1778     }
1779   *pos_add += num_copied;
1780   return status;
1781 }
1782
/* Work items that tre_expand_ast() keeps on its explicit stack. */
typedef enum {
  /* Visit the node pushed beneath this symbol. */
  EXPAND_RECURSE = 0,
  /* The argument of the iteration node pushed beneath this symbol has been
     visited; finish processing the iteration itself. */
  EXPAND_AFTER_ITER = 1
} tre_expand_ast_symbol_t;
1787
1788 /* Expands each iteration node that has a finite nonzero minimum or maximum
1789    iteration count to a catenated sequence of copies of the node. */
1790 static reg_errcode_t
1791 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1792                int *position, tre_tag_direction_t *tag_directions)
1793 {
1794   reg_errcode_t status = REG_OK;
1795   int bottom = tre_stack_num_objects(stack);
1796   int pos_add = 0;
1797   int pos_add_total = 0;
1798   int max_pos = 0;
1799   int iter_depth = 0;
1800
1801   STACK_PUSHR(stack, voidptr, ast);
1802   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1803   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1804     {
1805       tre_ast_node_t *node;
1806       tre_expand_ast_symbol_t symbol;
1807
1808       if (status != REG_OK)
1809         break;
1810
1811       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1812       node = tre_stack_pop_voidptr(stack);
1813       switch (symbol)
1814         {
1815         case EXPAND_RECURSE:
1816           switch (node->type)
1817             {
1818             case LITERAL:
1819               {
1820                 tre_literal_t *lit= node->obj;
1821                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1822                   {
1823                     lit->position += pos_add;
1824                     if (lit->position > max_pos)
1825                       max_pos = lit->position;
1826                   }
1827                 break;
1828               }
1829             case UNION:
1830               {
1831                 tre_union_t *uni = node->obj;
1832                 STACK_PUSHX(stack, voidptr, uni->right);
1833                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1834                 STACK_PUSHX(stack, voidptr, uni->left);
1835                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1836                 break;
1837               }
1838             case CATENATION:
1839               {
1840                 tre_catenation_t *cat = node->obj;
1841                 STACK_PUSHX(stack, voidptr, cat->right);
1842                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1843                 STACK_PUSHX(stack, voidptr, cat->left);
1844                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1845                 break;
1846               }
1847             case ITERATION:
1848               {
1849                 tre_iteration_t *iter = node->obj;
1850                 STACK_PUSHX(stack, int, pos_add);
1851                 STACK_PUSHX(stack, voidptr, node);
1852                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1853                 STACK_PUSHX(stack, voidptr, iter->arg);
1854                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1855                 /* If we are going to expand this node at EXPAND_AFTER_ITER
1856                    then don't increase the `pos' fields of the nodes now, it
1857                    will get done when expanding. */
1858                 if (iter->min > 1 || iter->max > 1)
1859                   pos_add = 0;
1860                 iter_depth++;
1861                 break;
1862               }
1863             default:
1864               assert(0);
1865               break;
1866             }
1867           break;
1868         case EXPAND_AFTER_ITER:
1869           {
1870             tre_iteration_t *iter = node->obj;
1871             int pos_add_last;
1872             pos_add = tre_stack_pop_int(stack);
1873             pos_add_last = pos_add;
1874             if (iter->min > 1 || iter->max > 1)
1875               {
1876                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1877                 int j;
1878                 int pos_add_save = pos_add;
1879
1880                 /* Create a catenated sequence of copies of the node. */
1881                 for (j = 0; j < iter->min; j++)
1882                   {
1883                     tre_ast_node_t *copy;
1884                     /* Remove tags from all but the last copy. */
1885                     int flags = ((j + 1 < iter->min)
1886                                  ? COPY_REMOVE_TAGS
1887                                  : COPY_MAXIMIZE_FIRST_TAG);
1888                     pos_add_save = pos_add;
1889                     status = tre_copy_ast(mem, stack, iter->arg, flags,
1890                                           &pos_add, tag_directions, &copy,
1891                                           &max_pos);
1892                     if (status != REG_OK)
1893                       return status;
1894                     if (seq1 != NULL)
1895                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
1896                     else
1897                       seq1 = copy;
1898                     if (seq1 == NULL)
1899                       return REG_ESPACE;
1900                   }
1901
1902                 if (iter->max == -1)
1903                   {
1904                     /* No upper limit. */
1905                     pos_add_save = pos_add;
1906                     status = tre_copy_ast(mem, stack, iter->arg, 0,
1907                                           &pos_add, NULL, &seq2, &max_pos);
1908                     if (status != REG_OK)
1909                       return status;
1910                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1911                     if (seq2 == NULL)
1912                       return REG_ESPACE;
1913                   }
1914                 else
1915                   {
1916                     for (j = iter->min; j < iter->max; j++)
1917                       {
1918                         tre_ast_node_t *tmp, *copy;
1919                         pos_add_save = pos_add;
1920                         status = tre_copy_ast(mem, stack, iter->arg, 0,
1921                                               &pos_add, NULL, &copy, &max_pos);
1922                         if (status != REG_OK)
1923                           return status;
1924                         if (seq2 != NULL)
1925                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
1926                         else
1927                           seq2 = copy;
1928                         if (seq2 == NULL)
1929                           return REG_ESPACE;
1930                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1931                         if (tmp == NULL)
1932                           return REG_ESPACE;
1933                         seq2 = tre_ast_new_union(mem, tmp, seq2);
1934                         if (seq2 == NULL)
1935                           return REG_ESPACE;
1936                       }
1937                   }
1938
1939                 pos_add = pos_add_save;
1940                 if (seq1 == NULL)
1941                   seq1 = seq2;
1942                 else if (seq2 != NULL)
1943                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1944                 if (seq1 == NULL)
1945                   return REG_ESPACE;
1946                 node->obj = seq1->obj;
1947                 node->type = seq1->type;
1948               }
1949
1950             iter_depth--;
1951             pos_add_total += pos_add - pos_add_last;
1952             if (iter_depth == 0)
1953               pos_add = pos_add_total;
1954
1955             break;
1956           }
1957         default:
1958           assert(0);
1959           break;
1960         }
1961     }
1962
1963   *position += pos_add_total;
1964
1965   /* `max_pos' should never be larger than `*position' if the above
1966      code works, but just an extra safeguard let's make sure
1967      `*position' is set large enough so enough memory will be
1968      allocated for the transition table. */
1969   if (max_pos > *position)
1970     *position = max_pos;
1971
1972   return status;
1973 }
1974
1975 static tre_pos_and_tags_t *
1976 tre_set_empty(tre_mem_t mem)
1977 {
1978   tre_pos_and_tags_t *new_set;
1979
1980   new_set = tre_mem_calloc(mem, sizeof(*new_set));
1981   if (new_set == NULL)
1982     return NULL;
1983
1984   new_set[0].position = -1;
1985   new_set[0].code_min = -1;
1986   new_set[0].code_max = -1;
1987
1988   return new_set;
1989 }
1990
1991 static tre_pos_and_tags_t *
1992 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1993             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1994 {
1995   tre_pos_and_tags_t *new_set;
1996
1997   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1998   if (new_set == NULL)
1999     return NULL;
2000
2001   new_set[0].position = position;
2002   new_set[0].code_min = code_min;
2003   new_set[0].code_max = code_max;
2004   new_set[0].class = class;
2005   new_set[0].neg_classes = neg_classes;
2006   new_set[0].backref = backref;
2007   new_set[1].position = -1;
2008   new_set[1].code_min = -1;
2009   new_set[1].code_max = -1;
2010
2011   return new_set;
2012 }
2013
2014 static tre_pos_and_tags_t *
2015 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2016               int *tags, int assertions)
2017 {
2018   int s1, s2, i, j;
2019   tre_pos_and_tags_t *new_set;
2020   int *new_tags;
2021   int num_tags;
2022
2023   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2024   for (s1 = 0; set1[s1].position >= 0; s1++);
2025   for (s2 = 0; set2[s2].position >= 0; s2++);
2026   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2027   if (!new_set )
2028     return NULL;
2029
2030   for (s1 = 0; set1[s1].position >= 0; s1++)
2031     {
2032       new_set[s1].position = set1[s1].position;
2033       new_set[s1].code_min = set1[s1].code_min;
2034       new_set[s1].code_max = set1[s1].code_max;
2035       new_set[s1].assertions = set1[s1].assertions | assertions;
2036       new_set[s1].class = set1[s1].class;
2037       new_set[s1].neg_classes = set1[s1].neg_classes;
2038       new_set[s1].backref = set1[s1].backref;
2039       if (set1[s1].tags == NULL && tags == NULL)
2040         new_set[s1].tags = NULL;
2041       else
2042         {
2043           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2044           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2045                                          * (i + num_tags + 1)));
2046           if (new_tags == NULL)
2047             return NULL;
2048           for (j = 0; j < i; j++)
2049             new_tags[j] = set1[s1].tags[j];
2050           for (i = 0; i < num_tags; i++)
2051             new_tags[j + i] = tags[i];
2052           new_tags[j + i] = -1;
2053           new_set[s1].tags = new_tags;
2054         }
2055     }
2056
2057   for (s2 = 0; set2[s2].position >= 0; s2++)
2058     {
2059       new_set[s1 + s2].position = set2[s2].position;
2060       new_set[s1 + s2].code_min = set2[s2].code_min;
2061       new_set[s1 + s2].code_max = set2[s2].code_max;
2062       /* XXX - why not | assertions here as well? */
2063       new_set[s1 + s2].assertions = set2[s2].assertions;
2064       new_set[s1 + s2].class = set2[s2].class;
2065       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2066       new_set[s1 + s2].backref = set2[s2].backref;
2067       if (set2[s2].tags == NULL)
2068         new_set[s1 + s2].tags = NULL;
2069       else
2070         {
2071           for (i = 0; set2[s2].tags[i] >= 0; i++);
2072           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2073           if (new_tags == NULL)
2074             return NULL;
2075           for (j = 0; j < i; j++)
2076             new_tags[j] = set2[s2].tags[j];
2077           new_tags[j] = -1;
2078           new_set[s1 + s2].tags = new_tags;
2079         }
2080     }
2081   new_set[s1 + s2].position = -1;
2082   return new_set;
2083 }
2084
2085 /* Finds the empty path through `node' which is the one that should be
2086    taken according to POSIX.2 rules, and adds the tags on that path to
2087    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2088    set to the number of tags seen on the path. */
2089 static reg_errcode_t
2090 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2091                 int *assertions, int *num_tags_seen)
2092 {
2093   tre_literal_t *lit;
2094   tre_union_t *uni;
2095   tre_catenation_t *cat;
2096   tre_iteration_t *iter;
2097   int i;
2098   int bottom = tre_stack_num_objects(stack);
2099   reg_errcode_t status = REG_OK;
2100   if (num_tags_seen)
2101     *num_tags_seen = 0;
2102
2103   status = tre_stack_push_voidptr(stack, node);
2104
2105   /* Walk through the tree recursively. */
2106   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2107     {
2108       node = tre_stack_pop_voidptr(stack);
2109
2110       switch (node->type)
2111         {
2112         case LITERAL:
2113           lit = (tre_literal_t *)node->obj;
2114           switch (lit->code_min)
2115             {
2116             case TAG:
2117               if (lit->code_max >= 0)
2118                 {
2119                   if (tags != NULL)
2120                     {
2121                       /* Add the tag to `tags'. */
2122                       for (i = 0; tags[i] >= 0; i++)
2123                         if (tags[i] == lit->code_max)
2124                           break;
2125                       if (tags[i] < 0)
2126                         {
2127                           tags[i] = lit->code_max;
2128                           tags[i + 1] = -1;
2129                         }
2130                     }
2131                   if (num_tags_seen)
2132                     (*num_tags_seen)++;
2133                 }
2134               break;
2135             case ASSERTION:
2136               assert(lit->code_max >= 1
2137                      || lit->code_max <= ASSERT_LAST);
2138               if (assertions != NULL)
2139                 *assertions |= lit->code_max;
2140               break;
2141             case EMPTY:
2142               break;
2143             default:
2144               assert(0);
2145               break;
2146             }
2147           break;
2148
2149         case UNION:
2150           /* Subexpressions starting earlier take priority over ones
2151              starting later, so we prefer the left subexpression over the
2152              right subexpression. */
2153           uni = (tre_union_t *)node->obj;
2154           if (uni->left->nullable)
2155             STACK_PUSHX(stack, voidptr, uni->left)
2156           else if (uni->right->nullable)
2157             STACK_PUSHX(stack, voidptr, uni->right)
2158           else
2159             assert(0);
2160           break;
2161
2162         case CATENATION:
2163           /* The path must go through both children. */
2164           cat = (tre_catenation_t *)node->obj;
2165           assert(cat->left->nullable);
2166           assert(cat->right->nullable);
2167           STACK_PUSHX(stack, voidptr, cat->left);
2168           STACK_PUSHX(stack, voidptr, cat->right);
2169           break;
2170
2171         case ITERATION:
2172           /* A match with an empty string is preferred over no match at
2173              all, so we go through the argument if possible. */
2174           iter = (tre_iteration_t *)node->obj;
2175           if (iter->arg->nullable)
2176             STACK_PUSHX(stack, voidptr, iter->arg);
2177           break;
2178
2179         default:
2180           assert(0);
2181           break;
2182         }
2183     }
2184
2185   return status;
2186 }
2187
2188
/* Work items that tre_compute_nfl() keeps on its explicit stack.  Each is
   pushed on top of the node it applies to. */
typedef enum {
  NFL_RECURSE = 0,          /* visit the node (and schedule its children) */
  NFL_POST_UNION = 1,       /* children done; finish a union node */
  NFL_POST_CATENATION = 2,  /* children done; finish a catenation node */
  NFL_POST_ITERATION = 3    /* argument done; finish an iteration node */
} tre_nfl_stack_symbol_t;
2195
2196
2197 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2198    the nodes of the AST `tree'. */
2199 static reg_errcode_t
2200 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2201 {
2202   int bottom = tre_stack_num_objects(stack);
2203
2204   STACK_PUSHR(stack, voidptr, tree);
2205   STACK_PUSHR(stack, int, NFL_RECURSE);
2206
2207   while (tre_stack_num_objects(stack) > bottom)
2208     {
2209       tre_nfl_stack_symbol_t symbol;
2210       tre_ast_node_t *node;
2211
2212       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2213       node = tre_stack_pop_voidptr(stack);
2214       switch (symbol)
2215         {
2216         case NFL_RECURSE:
2217           switch (node->type)
2218             {
2219             case LITERAL:
2220               {
2221                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2222                 if (IS_BACKREF(lit))
2223                   {
2224                     /* Back references: nullable = false, firstpos = {i},
2225                        lastpos = {i}. */
2226                     node->nullable = 0;
2227                     node->firstpos = tre_set_one(mem, lit->position, 0,
2228                                              TRE_CHAR_MAX, 0, NULL, -1);
2229                     if (!node->firstpos)
2230                       return REG_ESPACE;
2231                     node->lastpos = tre_set_one(mem, lit->position, 0,
2232                                                 TRE_CHAR_MAX, 0, NULL,
2233                                                 (int)lit->code_max);
2234                     if (!node->lastpos)
2235                       return REG_ESPACE;
2236                   }
2237                 else if (lit->code_min < 0)
2238                   {
2239                     /* Tags, empty strings, params, and zero width assertions:
2240                        nullable = true, firstpos = {}, and lastpos = {}. */
2241                     node->nullable = 1;
2242                     node->firstpos = tre_set_empty(mem);
2243                     if (!node->firstpos)
2244                       return REG_ESPACE;
2245                     node->lastpos = tre_set_empty(mem);
2246                     if (!node->lastpos)
2247                       return REG_ESPACE;
2248                   }
2249                 else
2250                   {
2251                     /* Literal at position i: nullable = false, firstpos = {i},
2252                        lastpos = {i}. */
2253                     node->nullable = 0;
2254                     node->firstpos =
2255                       tre_set_one(mem, lit->position, (int)lit->code_min,
2256                                   (int)lit->code_max, 0, NULL, -1);
2257                     if (!node->firstpos)
2258                       return REG_ESPACE;
2259                     node->lastpos = tre_set_one(mem, lit->position,
2260                                                 (int)lit->code_min,
2261                                                 (int)lit->code_max,
2262                                                 lit->class, lit->neg_classes,
2263                                                 -1);
2264                     if (!node->lastpos)
2265                       return REG_ESPACE;
2266                   }
2267                 break;
2268               }
2269
2270             case UNION:
2271               /* Compute the attributes for the two subtrees, and after that
2272                  for this node. */
2273               STACK_PUSHR(stack, voidptr, node);
2274               STACK_PUSHR(stack, int, NFL_POST_UNION);
2275               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2276               STACK_PUSHR(stack, int, NFL_RECURSE);
2277               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2278               STACK_PUSHR(stack, int, NFL_RECURSE);
2279               break;
2280
2281             case CATENATION:
2282               /* Compute the attributes for the two subtrees, and after that
2283                  for this node. */
2284               STACK_PUSHR(stack, voidptr, node);
2285               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2286               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2287               STACK_PUSHR(stack, int, NFL_RECURSE);
2288               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2289               STACK_PUSHR(stack, int, NFL_RECURSE);
2290               break;
2291
2292             case ITERATION:
2293               /* Compute the attributes for the subtree, and after that for
2294                  this node. */
2295               STACK_PUSHR(stack, voidptr, node);
2296               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2297               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2298               STACK_PUSHR(stack, int, NFL_RECURSE);
2299               break;
2300             }
2301           break; /* end case: NFL_RECURSE */
2302
2303         case NFL_POST_UNION:
2304           {
2305             tre_union_t *uni = (tre_union_t *)node->obj;
2306             node->nullable = uni->left->nullable || uni->right->nullable;
2307             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2308                                            uni->right->firstpos, NULL, 0);
2309             if (!node->firstpos)
2310               return REG_ESPACE;
2311             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2312                                           uni->right->lastpos, NULL, 0);
2313             if (!node->lastpos)
2314               return REG_ESPACE;
2315             break;
2316           }
2317
2318         case NFL_POST_ITERATION:
2319           {
2320             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2321
2322             if (iter->min == 0 || iter->arg->nullable)
2323               node->nullable = 1;
2324             else
2325               node->nullable = 0;
2326             node->firstpos = iter->arg->firstpos;
2327             node->lastpos = iter->arg->lastpos;
2328             break;
2329           }
2330
2331         case NFL_POST_CATENATION:
2332           {
2333             int num_tags, *tags, assertions;
2334             reg_errcode_t status;
2335             tre_catenation_t *cat = node->obj;
2336             node->nullable = cat->left->nullable && cat->right->nullable;
2337
2338             /* Compute firstpos. */
2339             if (cat->left->nullable)
2340               {
2341                 /* The left side matches the empty string.  Make a first pass
2342                    with tre_match_empty() to get the number of tags and
2343                    parameters. */
2344                 status = tre_match_empty(stack, cat->left,
2345                                          NULL, NULL, &num_tags);
2346                 if (status != REG_OK)
2347                   return status;
2348                 /* Allocate arrays for the tags and parameters. */
2349                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2350                 if (!tags)
2351                   return REG_ESPACE;
2352                 tags[0] = -1;
2353                 assertions = 0;
2354                 /* Second pass with tre_mach_empty() to get the list of
2355                    tags and parameters. */
2356                 status = tre_match_empty(stack, cat->left, tags,
2357                                          &assertions, NULL);
2358                 if (status != REG_OK)
2359                   {
2360                     xfree(tags);
2361                     return status;
2362                   }
2363                 node->firstpos =
2364                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2365                                 tags, assertions);
2366                 xfree(tags);
2367                 if (!node->firstpos)
2368                   return REG_ESPACE;
2369               }
2370             else
2371               {
2372                 node->firstpos = cat->left->firstpos;
2373               }
2374
2375             /* Compute lastpos. */
2376             if (cat->right->nullable)
2377               {
2378                 /* The right side matches the empty string.  Make a first pass
2379                    with tre_match_empty() to get the number of tags and
2380                    parameters. */
2381                 status = tre_match_empty(stack, cat->right,
2382                                          NULL, NULL, &num_tags);
2383                 if (status != REG_OK)
2384                   return status;
2385                 /* Allocate arrays for the tags and parameters. */
2386                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2387                 if (!tags)
2388                   return REG_ESPACE;
2389                 tags[0] = -1;
2390                 assertions = 0;
2391                 /* Second pass with tre_mach_empty() to get the list of
2392                    tags and parameters. */
2393                 status = tre_match_empty(stack, cat->right, tags,
2394                                          &assertions, NULL);
2395                 if (status != REG_OK)
2396                   {
2397                     xfree(tags);
2398                     return status;
2399                   }
2400                 node->lastpos =
2401                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2402                                 tags, assertions);
2403                 xfree(tags);
2404                 if (!node->lastpos)
2405                   return REG_ESPACE;
2406               }
2407             else
2408               {
2409                 node->lastpos = cat->right->lastpos;
2410               }
2411             break;
2412           }
2413
2414         default:
2415           assert(0);
2416           break;
2417         }
2418     }
2419
2420   return REG_OK;
2421 }
2422
2423
2424 /* Adds a transition from each position in `p1' to each position in `p2'. */
2425 static reg_errcode_t
2426 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2427                tre_tnfa_transition_t *transitions,
2428                int *counts, int *offs)
2429 {
2430   tre_pos_and_tags_t *orig_p2 = p2;
2431   tre_tnfa_transition_t *trans;
2432   int i, j, k, l, dup, prev_p2_pos;
2433
2434   if (transitions != NULL)
2435     while (p1->position >= 0)
2436       {
2437         p2 = orig_p2;
2438         prev_p2_pos = -1;
2439         while (p2->position >= 0)
2440           {
2441             /* Optimization: if this position was already handled, skip it. */
2442             if (p2->position == prev_p2_pos)
2443               {
2444                 p2++;
2445                 continue;
2446               }
2447             prev_p2_pos = p2->position;
2448             /* Set `trans' to point to the next unused transition from
2449                position `p1->position'. */
2450             trans = transitions + offs[p1->position];
2451             while (trans->state != NULL)
2452               {
2453 #if 0
2454                 /* If we find a previous transition from `p1->position' to
2455                    `p2->position', it is overwritten.  This can happen only
2456                    if there are nested loops in the regexp, like in "((a)*)*".
2457                    In POSIX.2 repetition using the outer loop is always
2458                    preferred over using the inner loop.  Therefore the
2459                    transition for the inner loop is useless and can be thrown
2460                    away. */
2461                 /* XXX - The same position is used for all nodes in a bracket
2462                    expression, so this optimization cannot be used (it will
2463                    break bracket expressions) unless I figure out a way to
2464                    detect it here. */
2465                 if (trans->state_id == p2->position)
2466                   {
2467                     break;
2468                   }
2469 #endif
2470                 trans++;
2471               }
2472
2473             if (trans->state == NULL)
2474               (trans + 1)->state = NULL;
2475             /* Use the character ranges, assertions, etc. from `p1' for
2476                the transition from `p1' to `p2'. */
2477             trans->code_min = p1->code_min;
2478             trans->code_max = p1->code_max;
2479             trans->state = transitions + offs[p2->position];
2480             trans->state_id = p2->position;
2481             trans->assertions = p1->assertions | p2->assertions
2482               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2483               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2484             if (p1->backref >= 0)
2485               {
2486                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2487                 assert(p2->backref < 0);
2488                 trans->u.backref = p1->backref;
2489                 trans->assertions |= ASSERT_BACKREF;
2490               }
2491             else
2492               trans->u.class = p1->class;
2493             if (p1->neg_classes != NULL)
2494               {
2495                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2496                 trans->neg_classes =
2497                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2498                 if (trans->neg_classes == NULL)
2499                   return REG_ESPACE;
2500                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2501                   trans->neg_classes[i] = p1->neg_classes[i];
2502                 trans->neg_classes[i] = (tre_ctype_t)0;
2503               }
2504             else
2505               trans->neg_classes = NULL;
2506
2507             /* Find out how many tags this transition has. */
2508             i = 0;
2509             if (p1->tags != NULL)
2510               while(p1->tags[i] >= 0)
2511                 i++;
2512             j = 0;
2513             if (p2->tags != NULL)
2514               while(p2->tags[j] >= 0)
2515                 j++;
2516
2517             /* If we are overwriting a transition, free the old tag array. */
2518             if (trans->tags != NULL)
2519               xfree(trans->tags);
2520             trans->tags = NULL;
2521
2522             /* If there were any tags, allocate an array and fill it. */
2523             if (i + j > 0)
2524               {
2525                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2526                 if (!trans->tags)
2527                   return REG_ESPACE;
2528                 i = 0;
2529                 if (p1->tags != NULL)
2530                   while(p1->tags[i] >= 0)
2531                     {
2532                       trans->tags[i] = p1->tags[i];
2533                       i++;
2534                     }
2535                 l = i;
2536                 j = 0;
2537                 if (p2->tags != NULL)
2538                   while (p2->tags[j] >= 0)
2539                     {
2540                       /* Don't add duplicates. */
2541                       dup = 0;
2542                       for (k = 0; k < i; k++)
2543                         if (trans->tags[k] == p2->tags[j])
2544                           {
2545                             dup = 1;
2546                             break;
2547                           }
2548                       if (!dup)
2549                         trans->tags[l++] = p2->tags[j];
2550                       j++;
2551                     }
2552                 trans->tags[l] = -1;
2553               }
2554
2555             p2++;
2556           }
2557         p1++;
2558       }
2559   else
2560     /* Compute a maximum limit for the number of transitions leaving
2561        from each state. */
2562     while (p1->position >= 0)
2563       {
2564         p2 = orig_p2;
2565         while (p2->position >= 0)
2566           {
2567             counts[p1->position]++;
2568             p2++;
2569           }
2570         p1++;
2571       }
2572   return REG_OK;
2573 }
2574
2575 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
2576    labelled with one character range (there are no transitions on empty
2577    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2578    the regexp. */
2579 static reg_errcode_t
2580 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2581                 int *counts, int *offs)
2582 {
2583   tre_union_t *uni;
2584   tre_catenation_t *cat;
2585   tre_iteration_t *iter;
2586   reg_errcode_t errcode = REG_OK;
2587
2588   /* XXX - recurse using a stack!. */
2589   switch (node->type)
2590     {
2591     case LITERAL:
2592       break;
2593     case UNION:
2594       uni = (tre_union_t *)node->obj;
2595       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2596       if (errcode != REG_OK)
2597         return errcode;
2598       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2599       break;
2600
2601     case CATENATION:
2602       cat = (tre_catenation_t *)node->obj;
2603       /* Add a transition from each position in cat->left->lastpos
2604          to each position in cat->right->firstpos. */
2605       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2606                                transitions, counts, offs);
2607       if (errcode != REG_OK)
2608         return errcode;
2609       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2610       if (errcode != REG_OK)
2611         return errcode;
2612       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2613       break;
2614
2615     case ITERATION:
2616       iter = (tre_iteration_t *)node->obj;
2617       assert(iter->max == -1 || iter->max == 1);
2618
2619       if (iter->max == -1)
2620         {
2621           assert(iter->min == 0 || iter->min == 1);
2622           /* Add a transition from each last position in the iterated
2623              expression to each first position. */
2624           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2625                                    transitions, counts, offs);
2626           if (errcode != REG_OK)
2627             return errcode;
2628         }
2629       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2630       break;
2631     }
2632   return errcode;
2633 }
2634
2635
/* Records `err' in the local variable `errcode' and jumps to the local
   label `error_exit'.  The enclosing function must provide both.  The
   constant conditions are marked with CONSTCOND comments. */
#define ERROR_EXIT(err)                                   \
  do { errcode = (err); if (/*CONSTCOND*/1) goto error_exit; } \
  while (/*CONSTCOND*/0)
2644
2645
2646 int
2647 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2648 {
2649   tre_stack_t *stack;
2650   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2651   tre_pos_and_tags_t *p;
2652   int *counts = NULL, *offs = NULL;
2653   int i, add = 0;
2654   tre_tnfa_transition_t *transitions, *initial;
2655   tre_tnfa_t *tnfa = NULL;
2656   tre_submatch_data_t *submatch_data;
2657   tre_tag_direction_t *tag_directions = NULL;
2658   reg_errcode_t errcode;
2659   tre_mem_t mem;
2660
2661   /* Parse context. */
2662   tre_parse_ctx_t parse_ctx;
2663
2664   /* Allocate a stack used throughout the compilation process for various
2665      purposes. */
2666   stack = tre_stack_new(512, 10240, 128);
2667   if (!stack)
2668     return REG_ESPACE;
2669   /* Allocate a fast memory allocator. */
2670   mem = tre_mem_new();
2671   if (!mem)
2672     {
2673       tre_stack_destroy(stack);
2674       return REG_ESPACE;
2675     }
2676
2677   /* Parse the regexp. */
2678   memset(&parse_ctx, 0, sizeof(parse_ctx));
2679   parse_ctx.mem = mem;
2680   parse_ctx.stack = stack;
2681   parse_ctx.re = regex;
2682   parse_ctx.cflags = cflags;
2683   parse_ctx.max_backref = -1;
2684   errcode = tre_parse(&parse_ctx);
2685   if (errcode != REG_OK)
2686     ERROR_EXIT(errcode);
2687   preg->re_nsub = parse_ctx.submatch_id - 1;
2688   tree = parse_ctx.n;
2689
2690 #ifdef TRE_DEBUG
2691   tre_ast_print(tree);
2692 #endif /* TRE_DEBUG */
2693
2694   /* Referring to nonexistent subexpressions is illegal. */
2695   if (parse_ctx.max_backref > (int)preg->re_nsub)
2696     ERROR_EXIT(REG_ESUBREG);
2697
2698   /* Allocate the TNFA struct. */
2699   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2700   if (tnfa == NULL)
2701     ERROR_EXIT(REG_ESPACE);
2702   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2703   tnfa->have_approx = 0;
2704   tnfa->num_submatches = parse_ctx.submatch_id;
2705
2706   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2707      regexp does not have back references, this can be skipped. */
2708   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2709     {
2710
2711       /* Figure out how many tags we will need. */
2712       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2713       if (errcode != REG_OK)
2714         ERROR_EXIT(errcode);
2715
2716       if (tnfa->num_tags > 0)
2717         {
2718           tag_directions = xmalloc(sizeof(*tag_directions)
2719                                    * (tnfa->num_tags + 1));
2720           if (tag_directions == NULL)
2721             ERROR_EXIT(REG_ESPACE);
2722           tnfa->tag_directions = tag_directions;
2723           memset(tag_directions, -1,
2724                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
2725         }
2726       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2727                                    sizeof(*tnfa->minimal_tags));
2728       if (tnfa->minimal_tags == NULL)
2729         ERROR_EXIT(REG_ESPACE);
2730
2731       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2732                               sizeof(*submatch_data));
2733       if (submatch_data == NULL)
2734         ERROR_EXIT(REG_ESPACE);
2735       tnfa->submatch_data = submatch_data;
2736
2737       errcode = tre_add_tags(mem, stack, tree, tnfa);
2738       if (errcode != REG_OK)
2739         ERROR_EXIT(errcode);
2740
2741     }
2742
2743   /* Expand iteration nodes. */
2744   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2745                            tag_directions);
2746   if (errcode != REG_OK)
2747     ERROR_EXIT(errcode);
2748
2749   /* Add a dummy node for the final state.
2750      XXX - For certain patterns this dummy node can be optimized away,
2751            for example "a*" or "ab*".   Figure out a simple way to detect
2752            this possibility. */
2753   tmp_ast_l = tree;
2754   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2755   if (tmp_ast_r == NULL)
2756     ERROR_EXIT(REG_ESPACE);
2757
2758   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2759   if (tree == NULL)
2760     ERROR_EXIT(REG_ESPACE);
2761
2762   errcode = tre_compute_nfl(mem, stack, tree);
2763   if (errcode != REG_OK)
2764     ERROR_EXIT(errcode);
2765
2766   counts = xmalloc(sizeof(int) * parse_ctx.position);
2767   if (counts == NULL)
2768     ERROR_EXIT(REG_ESPACE);
2769
2770   offs = xmalloc(sizeof(int) * parse_ctx.position);
2771   if (offs == NULL)
2772     ERROR_EXIT(REG_ESPACE);
2773
2774   for (i = 0; i < parse_ctx.position; i++)
2775     counts[i] = 0;
2776   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2777
2778   add = 0;
2779   for (i = 0; i < parse_ctx.position; i++)
2780     {
2781       offs[i] = add;
2782       add += counts[i] + 1;
2783       counts[i] = 0;
2784     }
2785   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2786   if (transitions == NULL)
2787     ERROR_EXIT(REG_ESPACE);
2788   tnfa->transitions = transitions;
2789   tnfa->num_transitions = add;
2790
2791   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2792   if (errcode != REG_OK)
2793     ERROR_EXIT(errcode);
2794
2795   tnfa->firstpos_chars = NULL;
2796
2797   p = tree->firstpos;
2798   i = 0;
2799   while (p->position >= 0)
2800     {
2801       i++;
2802       p++;
2803     }
2804
2805   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2806   if (initial == NULL)
2807     ERROR_EXIT(REG_ESPACE);
2808   tnfa->initial = initial;
2809
2810   i = 0;
2811   for (p = tree->firstpos; p->position >= 0; p++)
2812     {
2813       initial[i].state = transitions + offs[p->position];
2814       initial[i].state_id = p->position;
2815       initial[i].tags = NULL;
2816       /* Copy the arrays p->tags, and p->params, they are allocated
2817          from a tre_mem object. */
2818       if (p->tags)
2819         {
2820           int j;
2821           for (j = 0; p->tags[j] >= 0; j++);
2822           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2823           if (!initial[i].tags)
2824             ERROR_EXIT(REG_ESPACE);
2825           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2826         }
2827       initial[i].assertions = p->assertions;
2828       i++;
2829     }
2830   initial[i].state = NULL;
2831
2832   tnfa->num_transitions = add;
2833   tnfa->final = transitions + offs[tree->lastpos[0].position];
2834   tnfa->num_states = parse_ctx.position;
2835   tnfa->cflags = cflags;
2836
2837   tre_mem_destroy(mem);
2838   tre_stack_destroy(stack);
2839   xfree(counts);
2840   xfree(offs);
2841
2842   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2843   return REG_OK;
2844
2845  error_exit:
2846   /* Free everything that was allocated and return the error code. */
2847   tre_mem_destroy(mem);
2848   if (stack != NULL)
2849     tre_stack_destroy(stack);
2850   if (counts != NULL)
2851     xfree(counts);
2852   if (offs != NULL)
2853     xfree(offs);
2854   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2855   regfree(preg);
2856   return errcode;
2857 }
2858
2859
2860
2861
2862 void
2863 regfree(regex_t *preg)
2864 {
2865   tre_tnfa_t *tnfa;
2866   unsigned int i;
2867   tre_tnfa_transition_t *trans;
2868
2869   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2870   if (!tnfa)
2871     return;
2872
2873   for (i = 0; i < tnfa->num_transitions; i++)
2874     if (tnfa->transitions[i].state)
2875       {
2876         if (tnfa->transitions[i].tags)
2877           xfree(tnfa->transitions[i].tags);
2878         if (tnfa->transitions[i].neg_classes)
2879           xfree(tnfa->transitions[i].neg_classes);
2880       }
2881   if (tnfa->transitions)
2882     xfree(tnfa->transitions);
2883
2884   if (tnfa->initial)
2885     {
2886       for (trans = tnfa->initial; trans->state; trans++)
2887         {
2888           if (trans->tags)
2889             xfree(trans->tags);
2890         }
2891       xfree(tnfa->initial);
2892     }
2893
2894   if (tnfa->submatch_data)
2895     {
2896       for (i = 0; i < tnfa->num_submatches; i++)
2897         if (tnfa->submatch_data[i].parents)
2898           xfree(tnfa->submatch_data[i].parents);
2899       xfree(tnfa->submatch_data);
2900     }
2901
2902   if (tnfa->tag_directions)
2903     xfree(tnfa->tag_directions);
2904   if (tnfa->firstpos_chars)
2905     xfree(tnfa->firstpos_chars);
2906   if (tnfa->minimal_tags)
2907     xfree(tnfa->minimal_tags);
2908   xfree(tnfa);
2909 }