nsz Git - musl/blob - src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37 #include <ctype.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56 } tre_pos_and_tags_t;
  57
  58
  59 /***********************************************************************
  60  from tre-ast.c and tre-ast.h
  61 ***********************************************************************/
  62
  63 /* The different AST node types. */
  64 typedef enum {
  65   LITERAL,
  66   CATENATION,
  67   ITERATION,
  68   UNION
  69 } tre_ast_type_t;
  70
  71 /* Special subtypes of TRE_LITERAL. */
  72 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  73 #define ASSERTION -2   /* Assertion leaf. */
  74 #define TAG       -3   /* Tag leaf. */
  75 #define BACKREF   -4   /* Back reference leaf. */
  76
  77 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  78 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  80 #define IS_TAG(x)       ((x)->code_min == TAG)
  81 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  82
  83
  84 /* A generic AST node.  All AST nodes consist of this node on the top
  85    level with `obj' pointing to the actual content. */
  86 typedef struct {
  87   tre_ast_type_t type;   /* Type of the node. */
  88   void *obj;             /* Pointer to actual node. */
  89   int nullable;
  90   int submatch_id;
  91   int num_submatches;
  92   int num_tags;
  93   tre_pos_and_tags_t *firstpos;
  94   tre_pos_and_tags_t *lastpos;
  95 } tre_ast_node_t;
  96
  97
  98 /* A "literal" node.  These are created for assertions, back references,
  99    tags, matching parameter settings, and all expressions that match one
 100    character. */
 101 typedef struct {
 102   long code_min;
 103   long code_max;
 104   int position;
 105   tre_ctype_t class;
 106   tre_ctype_t *neg_classes;
 107 } tre_literal_t;
 108
 109 /* A "catenation" node.  These are created when two regexps are concatenated.
 110    If there are more than one subexpressions in sequence, the `left' part
 111    holds all but the last, and `right' part holds the last subexpression
 112    (catenation is left associative). */
 113 typedef struct {
 114   tre_ast_node_t *left;
 115   tre_ast_node_t *right;
 116 } tre_catenation_t;
 117
 118 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 119    operators. */
 120 typedef struct {
 121   /* Subexpression to match. */
 122   tre_ast_node_t *arg;
 123   /* Minimum number of consecutive matches. */
 124   int min;
 125   /* Maximum number of consecutive matches. */
 126   int max;
 127   /* If 0, match as many characters as possible, if 1 match as few as
 128      possible.  Note that this does not always mean the same thing as
 129      matching as many/few repetitions as possible. */
 130   unsigned int minimal:1;
 131 } tre_iteration_t;
 132
 133 /* An "union" node.  These are created for the "|" operator. */
 134 typedef struct {
 135   tre_ast_node_t *left;
 136   tre_ast_node_t *right;
 137 } tre_union_t;
 138
 139
 140 static tre_ast_node_t *
 141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 142 {
 143         tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
 144         if (!node || !obj)
 145                 return 0;
 146         node->obj = obj;
 147         node->type = type;
 148         node->nullable = -1;
 149         node->submatch_id = -1;
 150         return node;
 151 }
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 155 {
 156         tre_ast_node_t *node;
 157         tre_literal_t *lit;
 158
 159         lit = tre_mem_calloc(mem, sizeof *lit);
 160         node = tre_ast_new_node(mem, LITERAL, lit);
 161         if (!node)
 162                 return 0;
 163         lit->code_min = code_min;
 164         lit->code_max = code_max;
 165         lit->position = position;
 166         return node;
 167 }
 168
 169 static tre_ast_node_t *
 170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 171 {
 172         tre_ast_node_t *node;
 173         tre_iteration_t *iter;
 174
 175         iter = tre_mem_calloc(mem, sizeof *iter);
 176         node = tre_ast_new_node(mem, ITERATION, iter);
 177         if (!node)
 178                 return 0;
 179         iter->arg = arg;
 180         iter->min = min;
 181         iter->max = max;
 182         iter->minimal = minimal;
 183         node->num_submatches = arg->num_submatches;
 184         return node;
 185 }
 186
 187 static tre_ast_node_t *
 188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 189 {
 190         tre_ast_node_t *node;
 191         tre_union_t *un;
 192
 193         if (!left)
 194                 return right;
 195         un = tre_mem_calloc(mem, sizeof *un);
 196         node = tre_ast_new_node(mem, UNION, un);
 197         if (!node || !right)
 198                 return 0;
 199         un->left = left;
 200         un->right = right;
 201         node->num_submatches = left->num_submatches + right->num_submatches;
 202         return node;
 203 }
 204
 205 static tre_ast_node_t *
 206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 207 {
 208         tre_ast_node_t *node;
 209         tre_catenation_t *cat;
 210
 211         if (!left)
 212                 return right;
 213         cat = tre_mem_calloc(mem, sizeof *cat);
 214         node = tre_ast_new_node(mem, CATENATION, cat);
 215         if (!node)
 216                 return 0;
 217         cat->left = left;
 218         cat->right = right;
 219         node->num_submatches = left->num_submatches + right->num_submatches;
 220         return node;
 221 }
 222
 223
 224 /***********************************************************************
 225  from tre-stack.c and tre-stack.h
 226 ***********************************************************************/
 227
 228 typedef struct tre_stack_rec tre_stack_t;
 229
 230 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 231    is maximum size, and `increment' specifies how much more space will be
 232    allocated with realloc() if all space gets used up.  Returns the stack
 233    object or NULL if out of memory. */
 234 static tre_stack_t *
 235 tre_stack_new(int size, int max_size, int increment);
 236
 237 /* Frees the stack object. */
 238 static void
 239 tre_stack_destroy(tre_stack_t *s);
 240
 241 /* Returns the current number of objects in the stack. */
 242 static int
 243 tre_stack_num_objects(tre_stack_t *s);
 244
 245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 247    This tries to realloc() more space before failing if maximum size
 248    has not yet been reached.  Returns REG_OK if successful. */
 249 #define declare_pushf(typetag, type)                                          \
 250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 251
 252 declare_pushf(voidptr, void *);
 253 declare_pushf(int, int);
 254
 255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 256    element off of stack `s' and returns it.  The stack must not be
 257    empty. */
 258 #define declare_popf(typetag, type)               \
 259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 260
 261 declare_popf(voidptr, void *);
 262 declare_popf(int, int);
 263
 264 /* Just to save some typing. */
 265 #define STACK_PUSH(s, typetag, value)                                         \
 266   do                                                                          \
 267     {                                                                         \
 268       status = tre_stack_push_ ## typetag(s, value);                          \
 269     }                                                                         \
 270   while (/*CONSTCOND*/0)
 271
 272 #define STACK_PUSHX(s, typetag, value)                                        \
 273   {                                                                           \
 274     status = tre_stack_push_ ## typetag(s, value);                            \
 275     if (status != REG_OK)                                                     \
 276       break;                                                                  \
 277   }
 278
 279 #define STACK_PUSHR(s, typetag, value)                                        \
 280   {                                                                           \
 281     reg_errcode_t _status;                                                    \
 282     _status = tre_stack_push_ ## typetag(s, value);                           \
 283     if (_status != REG_OK)                                                    \
 284       return _status;                                                         \
 285   }
 286
 287 union tre_stack_item {
 288   void *voidptr_value;
 289   int int_value;
 290 };
 291
 292 struct tre_stack_rec {
 293   int size;
 294   int max_size;
 295   int increment;
 296   int ptr;
 297   union tre_stack_item *stack;
 298 };
 299
 300
 301 static tre_stack_t *
 302 tre_stack_new(int size, int max_size, int increment)
 303 {
 304   tre_stack_t *s;
 305
 306   s = xmalloc(sizeof(*s));
 307   if (s != NULL)
 308     {
 309       s->stack = xmalloc(sizeof(*s->stack) * size);
 310       if (s->stack == NULL)
 311         {
 312           xfree(s);
 313           return NULL;
 314         }
 315       s->size = size;
 316       s->max_size = max_size;
 317       s->increment = increment;
 318       s->ptr = 0;
 319     }
 320   return s;
 321 }
 322
 323 static void
 324 tre_stack_destroy(tre_stack_t *s)
 325 {
 326   xfree(s->stack);
 327   xfree(s);
 328 }
 329
 330 static int
 331 tre_stack_num_objects(tre_stack_t *s)
 332 {
 333   return s->ptr;
 334 }
 335
 336 static reg_errcode_t
 337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 338 {
 339   if (s->ptr < s->size)
 340     {
 341       s->stack[s->ptr] = value;
 342       s->ptr++;
 343     }
 344   else
 345     {
 346       if (s->size >= s->max_size)
 347         {
 348           return REG_ESPACE;
 349         }
 350       else
 351         {
 352           union tre_stack_item *new_buffer;
 353           int new_size;
 354           new_size = s->size + s->increment;
 355           if (new_size > s->max_size)
 356             new_size = s->max_size;
 357           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 358           if (new_buffer == NULL)
 359             {
 360               return REG_ESPACE;
 361             }
 362           assert(new_size > s->size);
 363           s->size = new_size;
 364           s->stack = new_buffer;
 365           tre_stack_push(s, value);
 366         }
 367     }
 368   return REG_OK;
 369 }
 370
 371 #define define_pushf(typetag, type)  \
 372   declare_pushf(typetag, type) {     \
 373     union tre_stack_item item;       \
 374     item.typetag ## _value = value;  \
 375     return tre_stack_push(s, item);  \
 376 }
 377
 378 define_pushf(int, int)
 379 define_pushf(voidptr, void *)
 380
 381 #define define_popf(typetag, type)                  \
 382   declare_popf(typetag, type) {                     \
 383     return s->stack[--s->ptr].typetag ## _value;    \
 384   }
 385
 386 define_popf(int, int)
 387 define_popf(voidptr, void *)
 388
 389
 390 /***********************************************************************
 391  from tre-parse.c and tre-parse.h
 392 ***********************************************************************/
 393
 394 /* Parse context. */
 395 typedef struct {
 396         /* Memory allocator. The AST is allocated using this. */
 397         tre_mem_t mem;
 398         /* Stack used for keeping track of regexp syntax. */
 399         tre_stack_t *stack;
 400         /* The parsed node after a parse function returns. */
 401         tre_ast_node_t *n;
 402         /* Position in the regexp pattern after a parse function returns. */
 403         const char *s;
 404         /* The first character of the regexp. */
 405         const char *re;
 406         /* Current submatch ID. */
 407         int submatch_id;
 408         /* Current position (number of literal). */
 409         int position;
 410         /* The highest back reference or -1 if none seen so far. */
 411         int max_backref;
 412         /* Compilation flags. */
 413         int cflags;
 414 } tre_parse_ctx_t;
 415
 416 /* Some macros for expanding \w, \s, etc. */
 417 static const struct {
 418         char c;
 419         const char *expansion;
 420 } tre_macros[] = {
 421         {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 422         {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 423         {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 424         {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
 425         { 0, 0 }
 426 };
 427
 428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 429    must have at least `len' items.  Sets buf[0] to zero if the there
 430    is no match in `tre_macros'. */
 431 static const char *tre_expand_macro(const char *s)
 432 {
 433         int i;
 434         for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
 435         return tre_macros[i].expansion;
 436 }
 437
 438 static int
 439 tre_compare_lit(const void *a, const void *b)
 440 {
 441         const tre_literal_t *const *la = a;
 442         const tre_literal_t *const *lb = b;
 443         /* assumes the range of valid code_min is < INT_MAX */
 444         return la[0]->code_min - lb[0]->code_min;
 445 }
 446
 447 struct literals {
 448         tre_mem_t mem;
 449         tre_literal_t **a;
 450         int len;
 451         int cap;
 452 };
 453
 454 static tre_literal_t *tre_new_lit(struct literals *p)
 455 {
 456         tre_literal_t **a;
 457         if (p->len >= p->cap) {
 458                 if (p->cap >= 1<<15)
 459                         return 0;
 460                 p->cap *= 2;
 461                 a = xrealloc(p->a, p->cap * sizeof *p->a);
 462                 if (!a)
 463                         return 0;
 464                 p->a = a;
 465         }
 466         a = p->a + p->len++;
 467         *a = tre_mem_calloc(p->mem, sizeof **a);
 468         return *a;
 469 }
 470
 471 static int add_icase_literals(struct literals *ls, int min, int max)
 472 {
 473         tre_literal_t *lit;
 474         int b, e, c;
 475         for (c=min; c<=max; ) {
 476                 /* assumes islower(c) and isupper(c) are exclusive
 477                    and toupper(c)!=c if islower(c).
 478                    multiple opposite case characters are not supported */
 479                 if (tre_islower(c)) {
 480                         b = e = tre_toupper(c);
 481                         for (c++, e++; c<=max; c++, e++)
 482                                 if (tre_toupper(c) != e) break;
 483                 } else if (tre_isupper(c)) {
 484                         b = e = tre_tolower(c);
 485                         for (c++, e++; c<=max; c++, e++)
 486                                 if (tre_tolower(c) != e) break;
 487                 } else {
 488                         c++;
 489                         continue;
 490                 }
 491                 lit = tre_new_lit(ls);
 492                 if (!lit)
 493                         return -1;
 494                 lit->code_min = b;
 495                 lit->code_max = e-1;
 496                 lit->position = -1;
 497         }
 498         return 0;
 499 }
 500
 501
 502 /* Maximum number of character classes in a negated bracket expression. */
 503 #define MAX_NEG_CLASSES 64
 504
 505 struct neg {
 506         int negate;
 507         int len;
 508         tre_ctype_t a[MAX_NEG_CLASSES];
 509 };
 510
 511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 512
 513 /*
 514 bracket grammar:
 515 Bracket  =  '[' List ']'  |  '[^' List ']'
 516 List     =  Term  |  List Term
 517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
 518 Range    =  Char '-' Char  |  Char '-' '-'
 519 Char     =  Coll  |  coll_single
 520 Meta     =  ']'  |  '-'
 521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
 522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
 523 Chclass  =  '[:' class ':]'
 524
 525 coll_single is a single char collating element but it can be
 526  '-' only at the beginning or end of a List and
 527  ']' only at the beginning of a List and
 528  '^' anywhere except after the openning '['
 529 */
 530
 531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
 532 {
 533         const char *start = s;
 534         tre_ctype_t class;
 535         int min, max;
 536         wchar_t wc;
 537         int len;
 538
 539         for (;;) {
 540                 class = 0;
 541                 len = mbtowc(&wc, s, -1);
 542                 if (len <= 0)
 543                         return *s ? REG_BADPAT : REG_EBRACK;
 544                 if (*s == ']' && s != start) {
 545                         ctx->s = s+1;
 546                         return REG_OK;
 547                 }
 548                 if (*s == '-' && s != start && s[1] != ']' &&
 549                     /* extension: [a-z--@] is accepted as [a-z]|[--@] */
 550                     (s[1] != '-' || s[2] == ']'))
 551                         return REG_ERANGE;
 552                 if (*s == '[' && (s[1] == '.' || s[1] == '='))
 553                         /* collating symbols and equivalence classes are not supported */
 554                         return REG_ECOLLATE;
 555                 if (*s == '[' && s[1] == ':') {
 556                         char tmp[CHARCLASS_NAME_MAX+1];
 557                         s += 2;
 558                         for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
 559                                 if (s[len] == ':') {
 560                                         memcpy(tmp, s, len);
 561                                         tmp[len] = 0;
 562                                         class = tre_ctype(tmp);
 563                                         break;
 564                                 }
 565                         }
 566                         if (!class || s[len+1] != ']')
 567                                 return REG_ECTYPE;
 568                         min = 0;
 569                         max = TRE_CHAR_MAX;
 570                         s += len+2;
 571                 } else {
 572                         min = max = wc;
 573                         s += len;
 574                         if (*s == '-' && s[1] != ']') {
 575                                 s++;
 576                                 len = mbtowc(&wc, s, -1);
 577                                 max = wc;
 578                                 /* XXX - Should use collation order instead of
 579                                    encoding values in character ranges. */
 580                                 if (len <= 0 || min > max)
 581                                         return REG_ERANGE;
 582                                 s += len;
 583                         }
 584                 }
 585
 586                 if (class && neg->negate) {
 587                         if (neg->len >= MAX_NEG_CLASSES)
 588                                 return REG_ESPACE;
 589                         neg->a[neg->len++] = class;
 590                 } else  {
 591                         tre_literal_t *lit = tre_new_lit(ls);
 592                         if (!lit)
 593                                 return REG_ESPACE;
 594                         lit->code_min = min;
 595                         lit->code_max = max;
 596                         lit->class = class;
 597                         lit->position = -1;
 598
 599                         /* Add opposite-case codepoints if REG_ICASE is present.
 600                            It seems that POSIX requires that bracket negation
 601                            should happen before case-folding, but most practical
 602                            implementations do it the other way around. Changing
 603                            the order would need efficient representation of
 604                            case-fold ranges and bracket range sets even with
 605                            simple patterns so this is ok for now. */
 606                         if (ctx->cflags & REG_ICASE && !class)
 607                                 if (add_icase_literals(ls, min, max))
 608                                         return REG_ESPACE;
 609                 }
 610         }
 611 }
 612
 613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 614 {
 615         int i, max, min, negmax, negmin;
 616         tre_ast_node_t *node = 0, *n;
 617         tre_ctype_t *nc = 0;
 618         tre_literal_t *lit;
 619         struct literals ls;
 620         struct neg neg;
 621         reg_errcode_t err;
 622
 623         ls.mem = ctx->mem;
 624         ls.len = 0;
 625         ls.cap = 32;
 626         ls.a = xmalloc(ls.cap * sizeof *ls.a);
 627         if (!ls.a)
 628                 return REG_ESPACE;
 629         neg.len = 0;
 630         neg.negate = *s == '^';
 631         if (neg.negate)
 632                 s++;
 633
 634         err = parse_bracket_terms(ctx, s, &ls, &neg);
 635         if (err != REG_OK)
 636                 goto parse_bracket_done;
 637
 638         if (neg.negate) {
 639                 /* Sort the array if we need to negate it. */
 640                 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
 641                 /* extra lit for the last negated range */
 642                 lit = tre_new_lit(&ls);
 643                 if (!lit) {
 644                         err = REG_ESPACE;
 645                         goto parse_bracket_done;
 646                 }
 647                 lit->code_min = TRE_CHAR_MAX+1;
 648                 lit->code_max = TRE_CHAR_MAX+1;
 649                 lit->position = -1;
 650                 /* negated classes */
 651                 if (neg.len) {
 652                         nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
 653                         if (!nc) {
 654                                 err = REG_ESPACE;
 655                                 goto parse_bracket_done;
 656                         }
 657                         memcpy(nc, neg.a, neg.len*sizeof *neg.a);
 658                         nc[neg.len] = 0;
 659                 }
 660         }
 661
 662         /* Build a union of the items in the array, negated if necessary. */
 663         negmax = negmin = 0;
 664         for (i = 0; i < ls.len; i++) {
 665                 lit = ls.a[i];
 666                 min = lit->code_min;
 667                 max = lit->code_max;
 668                 if (neg.negate) {
 669                         if (min <= negmin) {
 670                                 /* Overlap. */
 671                                 negmin = MAX(max + 1, negmin);
 672                                 continue;
 673                         }
 674                         negmax = min - 1;
 675                         lit->code_min = negmin;
 676                         lit->code_max = negmax;
 677                         negmin = max + 1;
 678                 }
 679                 lit->position = ctx->position;
 680                 lit->neg_classes = nc;
 681                 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
 682                 node = tre_ast_new_union(ctx->mem, node, n);
 683                 if (!node) {
 684                         err = REG_ESPACE;
 685                         break;
 686                 }
 687         }
 688
 689 parse_bracket_done:
 690         xfree(ls.a);
 691         ctx->position++;
 692         ctx->n = node;
 693         return err;
 694 }
 695
 696 static const char *parse_dup_count(const char *s, int *n)
 697 {
 698         *n = -1;
 699         if (!isdigit(*s))
 700                 return s;
 701         *n = 0;
 702         for (;;) {
 703                 *n = 10 * *n + (*s - '0');
 704                 s++;
 705                 if (!isdigit(*s) || *n > RE_DUP_MAX)
 706                         break;
 707         }
 708         return s;
 709 }
 710
 711 static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
 712 {
 713         int min, max;
 714
 715         s = parse_dup_count(s, &min);
 716         if (*s == ',')
 717                 s = parse_dup_count(s+1, &max);
 718         else
 719                 max = min;
 720
 721         if (
 722                 (max < min && max >= 0) ||
 723                 max > RE_DUP_MAX ||
 724                 min > RE_DUP_MAX ||
 725                 min < 0 ||
 726                 (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
 727                 *s++ != '}'
 728         )
 729                 return REG_BADBR;
 730
 731         if (min == 0 && max == 0)
 732                 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 733         else
 734                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
 735         if (!ctx->n)
 736                 return REG_ESPACE;
 737         ctx->s = s;
 738         return REG_OK;
 739 }
 740
 741 static int hexval(unsigned c)
 742 {
 743         if (c-'0'<10) return c-'0';
 744         c |= 32;
 745         if (c-'a'<6) return c-'a'+10;
 746         return -1;
 747 }
 748
 749 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
 750 {
 751         if (node->submatch_id >= 0) {
 752                 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 753                 if (!n)
 754                         return REG_ESPACE;
 755                 n = tre_ast_new_catenation(ctx->mem, n, node);
 756                 if (!n)
 757                         return REG_ESPACE;
 758                 n->num_submatches = node->num_submatches;
 759                 node = n;
 760         }
 761         node->submatch_id = subid;
 762         node->num_submatches++;
 763         ctx->n = node;
 764         return REG_OK;
 765 }
 766
 767 /*
 768 BRE grammar:
 769 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
 770 Branch =  Atom  |  Branch Atom
 771 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
 772 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 773
 774 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 775
 776 ERE grammar:
 777 Regex  =  Branch  |  Regex '|' Branch
 778 Branch =  Atom  |  Branch Atom
 779 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
 780 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 781
 782 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 783 */
 784
 785 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
 786 {
 787         int len, ere = ctx->cflags & REG_EXTENDED;
 788         const char *p;
 789         tre_ast_node_t *node;
 790         wchar_t wc;
 791         switch (*s) {
 792         case '[':
 793                 return parse_bracket(ctx, s+1);
 794         case '\\':
 795                 p = tre_expand_macro(s+1);
 796                 if (p) {
 797                         /* assume \X expansion is a single atom */
 798                         reg_errcode_t err = parse_atom(ctx, p);
 799                         ctx->s = s+2;
 800                         return err;
 801                 }
 802                 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
 803                 switch (*++s) {
 804                 case 0:
 805                         return REG_EESCAPE;
 806                 case 'b':
 807                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
 808                         break;
 809                 case 'B':
 810                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
 811                         break;
 812                 case '<':
 813                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
 814                         break;
 815                 case '>':
 816                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
 817                         break;
 818                 case 'x':
 819                         s++;
 820                         int i, v = 0, c;
 821                         len = 2;
 822                         if (*s == '{') {
 823                                 len = 8;
 824                                 s++;
 825                         }
 826                         for (i=0; i<len && v<0x110000; i++) {
 827                                 c = hexval(s[i]);
 828                                 if (c < 0) break;
 829                                 v = 16*v + c;
 830                         }
 831                         s += i;
 832                         if (len == 8) {
 833                                 if (*s != '}')
 834                                         return REG_EBRACE;
 835                                 s++;
 836                         }
 837                         node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);
 838                         ctx->position++;
 839                         s--;
 840                         break;
 841                 default:
 842                         if (isdigit(*s)) {
 843                                 /* back reference */
 844                                 int val = *s - '0';
 845                                 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position);
 846                                 ctx->max_backref = MAX(val, ctx->max_backref);
 847                         } else {
 848                                 /* extension: accept unknown escaped char
 849                                    as a literal */
 850                                 node = tre_ast_new_literal(ctx->mem, *s, *s, ctx->position);
 851                         }
 852                         ctx->position++;
 853                 }
 854                 s++;
 855                 break;
 856         case '.':
 857                 if (ctx->cflags & REG_NEWLINE) {
 858                         tre_ast_node_t *tmp1, *tmp2;
 859                         tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
 860                         tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
 861                         if (tmp1 && tmp2)
 862                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 863                         else
 864                                 node = 0;
 865                 } else {
 866                         node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 867                 }
 868                 s++;
 869                 break;
 870         case '^':
 871                 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
 872                 if (!ere && s != ctx->re)
 873                         goto parse_literal;
 874                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
 875                 s++;
 876                 break;
 877         case '$':
 878                 /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
 879                 if (!ere && s[1])
 880                         goto parse_literal;
 881                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
 882                 s++;
 883                 break;
 884         case '*':
 885         case '|':
 886         case '{':
 887         case '+':
 888         case '?':
 889                 if (!ere)
 890                         goto parse_literal;
 891         case 0:
 892                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 893                 break;
 894         default:
 895 parse_literal:
 896                 len = mbtowc(&wc, s, -1);
 897                 if (len < 0)
 898                         return REG_BADPAT;
 899                 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
 900                         tre_ast_node_t *tmp1, *tmp2;
 901                         /* multiple opposite case characters are not supported */
 902                         tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
 903                         tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
 904                         if (tmp1 && tmp2)
 905                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 906                         else
 907                                 node = 0;
 908                 } else {
 909                         node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 910                 }
 911                 ctx->position++;
 912                 s += len;
 913                 break;
 914         }
 915         if (!node)
 916                 return REG_ESPACE;
 917         ctx->n = node;
 918         ctx->s = s;
 919         return REG_OK;
 920 }
 921
 922 #define PUSHPTR(err, s, v) do { \
 923         if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
 924                 return err; \
 925 } while(0)
 926
 927 #define PUSHINT(err, s, v) do { \
 928         if ((err = tre_stack_push_int(s, v)) != REG_OK) \
 929                 return err; \
 930 } while(0)
 931
 932 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
 933 {
 934         tre_ast_node_t *nbranch=0, *nunion=0;
 935         int ere = ctx->cflags & REG_EXTENDED;
 936         const char *s = ctx->re;
 937         int subid = 0;
 938         int depth = 0;
 939         reg_errcode_t err;
 940         tre_stack_t *stack = ctx->stack;
 941
 942         PUSHINT(err, stack, subid++);
 943         for (;;) {
 944                 if ((!ere && *s == '\\' && s[1] == '(') ||
 945                     (ere && *s == '(')) {
 946                         PUSHPTR(err, stack, nunion);
 947                         PUSHPTR(err, stack, nbranch);
 948                         PUSHINT(err, stack, subid++);
 949                         s++;
 950                         if (!ere)
 951                                 s++;
 952                         depth++;
 953                         nbranch = nunion = 0;
 954                         continue;
 955                 }
 956                 if ((!ere && *s == '\\' && s[1] == ')') ||
 957                     (ere && *s == ')' && depth)) {
 958                         ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 959                         if (!ctx->n)
 960                                 return REG_ESPACE;
 961                 } else {
 962                         err = parse_atom(ctx, s);
 963                         if (err != REG_OK)
 964                                 return err;
 965                         s = ctx->s;
 966                 }
 967
 968         parse_iter:
 969                 /* extension: repetitions are accepted after an empty node
 970                    eg. (+), ^*, a$?, a|{2} */
 971                 switch (*s) {
 972                 case '+':
 973                 case '?':
 974                         if (!ere)
 975                                 break;
 976                         /* fallthrough */
 977                 case '*':;
 978                         int min=0, max=-1;
 979                         if (*s == '+')
 980                                 min = 1;
 981                         if (*s == '?')
 982                                 max = 1;
 983                         s++;
 984                         ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
 985                         if (!ctx->n)
 986                                 return REG_ESPACE;
 987                         /* extension: multiple consecutive *+?{,} is unspecified,
 988                            but (a+)+ has to be supported so accepting a++ makes
 989                            sense, note however that the RE_DUP_MAX limit can be
 990                            circumvented: (a{255}){255} uses a lot of memory.. */
 991                         goto parse_iter;
 992                 case '\\':
 993                         if (ere || s[1] != '{')
 994                                 break;
 995                         s++;
 996                         goto parse_brace;
 997                 case '{':
 998                         if (!ere)
 999                                 break;
1000                 parse_brace:
1001                         err = parse_dup(ctx, s+1);
1002                         if (err != REG_OK)
1003                                 return err;
1004                         s = ctx->s;
1005                         goto parse_iter;
1006                 }
1007
1008                 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1009                 if ((ere && *s == '|') ||
1010                     (ere && *s == ')' && depth) ||
1011                     (!ere && *s == '\\' && s[1] == ')') ||
1012                     !*s) {
1013                         /* extension: empty branch is unspecified (), (|a), (a|)
1014                            here they are not rejected but match on empty string */
1015                         int c = *s;
1016                         nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1017                         nbranch = 0;
1018                         if (c != '|') {
1019                                 if (c == '\\') {
1020                                         if (!depth) return REG_EPAREN;
1021                                         s+=2;
1022                                 } else if (c == ')')
1023                                         s++;
1024                                 depth--;
1025                                 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1026                                 if (err != REG_OK)
1027                                         return err;
1028                                 if (!c && depth<0) {
1029                                         ctx->submatch_id = subid;
1030                                         return REG_OK;
1031                                 }
1032                                 if (!c || depth<0)
1033                                         return REG_EPAREN;
1034                                 nbranch = tre_stack_pop_voidptr(stack);
1035                                 nunion = tre_stack_pop_voidptr(stack);
1036                                 goto parse_iter;
1037                         }
1038                         s++;
1039                 }
1040         }
1041 }
1042
1043
1044 /***********************************************************************
1045  from tre-compile.c
1046 ***********************************************************************/
1047
1048
1049 /*
1050   TODO:
1051    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1052      function calls.
1053 */
1054
1055 /*
1056   Algorithms to setup tags so that submatch addressing can be done.
1057 */
1058
1059
1060 /* Inserts a catenation node to the root of the tree given in `node'.
1061    As the left child a new tag with number `tag_id' to `node' is added,
1062    and the right child is the old root. */
1063 static reg_errcode_t
1064 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1065 {
1066   tre_catenation_t *c;
1067
1068   c = tre_mem_alloc(mem, sizeof(*c));
1069   if (c == NULL)
1070     return REG_ESPACE;
1071   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1072   if (c->left == NULL)
1073     return REG_ESPACE;
1074   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1075   if (c->right == NULL)
1076     return REG_ESPACE;
1077
1078   c->right->obj = node->obj;
1079   c->right->type = node->type;
1080   c->right->nullable = -1;
1081   c->right->submatch_id = -1;
1082   c->right->firstpos = NULL;
1083   c->right->lastpos = NULL;
1084   c->right->num_tags = 0;
1085   node->obj = c;
1086   node->type = CATENATION;
1087   return REG_OK;
1088 }
1089
1090 /* Inserts a catenation node to the root of the tree given in `node'.
1091    As the right child a new tag with number `tag_id' to `node' is added,
1092    and the left child is the old root. */
1093 static reg_errcode_t
1094 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1095 {
1096   tre_catenation_t *c;
1097
1098   c = tre_mem_alloc(mem, sizeof(*c));
1099   if (c == NULL)
1100     return REG_ESPACE;
1101   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1102   if (c->right == NULL)
1103     return REG_ESPACE;
1104   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1105   if (c->left == NULL)
1106     return REG_ESPACE;
1107
1108   c->left->obj = node->obj;
1109   c->left->type = node->type;
1110   c->left->nullable = -1;
1111   c->left->submatch_id = -1;
1112   c->left->firstpos = NULL;
1113   c->left->lastpos = NULL;
1114   c->left->num_tags = 0;
1115   node->obj = c;
1116   node->type = CATENATION;
1117   return REG_OK;
1118 }
1119
1120 typedef enum {
1121   ADDTAGS_RECURSE,
1122   ADDTAGS_AFTER_ITERATION,
1123   ADDTAGS_AFTER_UNION_LEFT,
1124   ADDTAGS_AFTER_UNION_RIGHT,
1125   ADDTAGS_AFTER_CAT_LEFT,
1126   ADDTAGS_AFTER_CAT_RIGHT,
1127   ADDTAGS_SET_SUBMATCH_END
1128 } tre_addtags_symbol_t;
1129
1130
1131 typedef struct {
1132   int tag;
1133   int next_tag;
1134 } tre_tag_states_t;
1135
1136
1137 /* Go through `regset' and set submatch data for submatches that are
1138    using this tag. */
1139 static void
1140 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1141 {
1142   int i;
1143
1144   for (i = 0; regset[i] >= 0; i++)
1145     {
1146       int id = regset[i] / 2;
1147       int start = !(regset[i] % 2);
1148       if (start)
1149         tnfa->submatch_data[id].so_tag = tag;
1150       else
1151         tnfa->submatch_data[id].eo_tag = tag;
1152     }
1153   regset[0] = -1;
1154 }
1155
1156
1157 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1158    subexpressions marked for submatch addressing can be traced. */
1159 static reg_errcode_t
1160 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1161              tre_tnfa_t *tnfa)
1162 {
1163   reg_errcode_t status = REG_OK;
1164   tre_addtags_symbol_t symbol;
1165   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1166   int bottom = tre_stack_num_objects(stack);
1167   /* True for first pass (counting number of needed tags) */
1168   int first_pass = (mem == NULL || tnfa == NULL);
1169   int *regset, *orig_regset;
1170   int num_tags = 0; /* Total number of tags. */
1171   int num_minimals = 0;  /* Number of special minimal tags. */
1172   int tag = 0;      /* The tag that is to be added next. */
1173   int next_tag = 1; /* Next tag to use after this one. */
1174   int *parents;     /* Stack of submatches the current submatch is
1175                        contained in. */
1176   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1177   tre_tag_states_t *saved_states;
1178
1179   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1180   if (!first_pass)
1181     {
1182       tnfa->end_tag = 0;
1183       tnfa->minimal_tags[0] = -1;
1184     }
1185
1186   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1187   if (regset == NULL)
1188     return REG_ESPACE;
1189   regset[0] = -1;
1190   orig_regset = regset;
1191
1192   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1193   if (parents == NULL)
1194     {
1195       xfree(regset);
1196       return REG_ESPACE;
1197     }
1198   parents[0] = -1;
1199
1200   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1201   if (saved_states == NULL)
1202     {
1203       xfree(regset);
1204       xfree(parents);
1205       return REG_ESPACE;
1206     }
1207   else
1208     {
1209       unsigned int i;
1210       for (i = 0; i <= tnfa->num_submatches; i++)
1211         saved_states[i].tag = -1;
1212     }
1213
1214   STACK_PUSH(stack, voidptr, node);
1215   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1216
1217   while (tre_stack_num_objects(stack) > bottom)
1218     {
1219       if (status != REG_OK)
1220         break;
1221
1222       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1223       switch (symbol)
1224         {
1225
1226         case ADDTAGS_SET_SUBMATCH_END:
1227           {
1228             int id = tre_stack_pop_int(stack);
1229             int i;
1230
1231             /* Add end of this submatch to regset. */
1232             for (i = 0; regset[i] >= 0; i++);
1233             regset[i] = id * 2 + 1;
1234             regset[i + 1] = -1;
1235
1236             /* Pop this submatch from the parents stack. */
1237             for (i = 0; parents[i] >= 0; i++);
1238             parents[i - 1] = -1;
1239             break;
1240           }
1241
1242         case ADDTAGS_RECURSE:
1243           node = tre_stack_pop_voidptr(stack);
1244
1245           if (node->submatch_id >= 0)
1246             {
1247               int id = node->submatch_id;
1248               int i;
1249
1250
1251               /* Add start of this submatch to regset. */
1252               for (i = 0; regset[i] >= 0; i++);
1253               regset[i] = id * 2;
1254               regset[i + 1] = -1;
1255
1256               if (!first_pass)
1257                 {
1258                   for (i = 0; parents[i] >= 0; i++);
1259                   tnfa->submatch_data[id].parents = NULL;
1260                   if (i > 0)
1261                     {
1262                       int *p = xmalloc(sizeof(*p) * (i + 1));
1263                       if (p == NULL)
1264                         {
1265                           status = REG_ESPACE;
1266                           break;
1267                         }
1268                       assert(tnfa->submatch_data[id].parents == NULL);
1269                       tnfa->submatch_data[id].parents = p;
1270                       for (i = 0; parents[i] >= 0; i++)
1271                         p[i] = parents[i];
1272                       p[i] = -1;
1273                     }
1274                 }
1275
1276               /* Add end of this submatch to regset after processing this
1277                  node. */
1278               STACK_PUSHX(stack, int, node->submatch_id);
1279               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1280             }
1281
1282           switch (node->type)
1283             {
1284             case LITERAL:
1285               {
1286                 tre_literal_t *lit = node->obj;
1287
1288                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1289                   {
1290                     int i;
1291                     if (regset[0] >= 0)
1292                       {
1293                         /* Regset is not empty, so add a tag before the
1294                            literal or backref. */
1295                         if (!first_pass)
1296                           {
1297                             status = tre_add_tag_left(mem, node, tag);
1298                             tnfa->tag_directions[tag] = direction;
1299                             if (minimal_tag >= 0)
1300                               {
1301                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1302                                 tnfa->minimal_tags[i] = tag;
1303                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1304                                 tnfa->minimal_tags[i + 2] = -1;
1305                                 minimal_tag = -1;
1306                                 num_minimals++;
1307                               }
1308                             tre_purge_regset(regset, tnfa, tag);
1309                           }
1310                         else
1311                           {
1312                             node->num_tags = 1;
1313                           }
1314
1315                         regset[0] = -1;
1316                         tag = next_tag;
1317                         num_tags++;
1318                         next_tag++;
1319                       }
1320                   }
1321                 else
1322                   {
1323                     assert(!IS_TAG(lit));
1324                   }
1325                 break;
1326               }
1327             case CATENATION:
1328               {
1329                 tre_catenation_t *cat = node->obj;
1330                 tre_ast_node_t *left = cat->left;
1331                 tre_ast_node_t *right = cat->right;
1332                 int reserved_tag = -1;
1333
1334
1335                 /* After processing right child. */
1336                 STACK_PUSHX(stack, voidptr, node);
1337                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1338
1339                 /* Process right child. */
1340                 STACK_PUSHX(stack, voidptr, right);
1341                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1342
1343                 /* After processing left child. */
1344                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1345                 if (left->num_tags > 0 && right->num_tags > 0)
1346                   {
1347                     /* Reserve the next tag to the right child. */
1348                     reserved_tag = next_tag;
1349                     next_tag++;
1350                   }
1351                 STACK_PUSHX(stack, int, reserved_tag);
1352                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1353
1354                 /* Process left child. */
1355                 STACK_PUSHX(stack, voidptr, left);
1356                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1357
1358                 }
1359               break;
1360             case ITERATION:
1361               {
1362                 tre_iteration_t *iter = node->obj;
1363
1364                 if (first_pass)
1365                   {
1366                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1367                   }
1368                 else
1369                   {
1370                     STACK_PUSHX(stack, int, tag);
1371                     STACK_PUSHX(stack, int, iter->minimal);
1372                   }
1373                 STACK_PUSHX(stack, voidptr, node);
1374                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1375
1376                 STACK_PUSHX(stack, voidptr, iter->arg);
1377                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1378
1379                 /* Regset is not empty, so add a tag here. */
1380                 if (regset[0] >= 0 || iter->minimal)
1381                   {
1382                     if (!first_pass)
1383                       {
1384                         int i;
1385                         status = tre_add_tag_left(mem, node, tag);
1386                         if (iter->minimal)
1387                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1388                         else
1389                           tnfa->tag_directions[tag] = direction;
1390                         if (minimal_tag >= 0)
1391                           {
1392                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1393                             tnfa->minimal_tags[i] = tag;
1394                             tnfa->minimal_tags[i + 1] = minimal_tag;
1395                             tnfa->minimal_tags[i + 2] = -1;
1396                             minimal_tag = -1;
1397                             num_minimals++;
1398                           }
1399                         tre_purge_regset(regset, tnfa, tag);
1400                       }
1401
1402                     regset[0] = -1;
1403                     tag = next_tag;
1404                     num_tags++;
1405                     next_tag++;
1406                   }
1407                 direction = TRE_TAG_MINIMIZE;
1408               }
1409               break;
1410             case UNION:
1411               {
1412                 tre_union_t *uni = node->obj;
1413                 tre_ast_node_t *left = uni->left;
1414                 tre_ast_node_t *right = uni->right;
1415                 int left_tag;
1416                 int right_tag;
1417
1418                 if (regset[0] >= 0)
1419                   {
1420                     left_tag = next_tag;
1421                     right_tag = next_tag + 1;
1422                   }
1423                 else
1424                   {
1425                     left_tag = tag;
1426                     right_tag = next_tag;
1427                   }
1428
1429                 /* After processing right child. */
1430                 STACK_PUSHX(stack, int, right_tag);
1431                 STACK_PUSHX(stack, int, left_tag);
1432                 STACK_PUSHX(stack, voidptr, regset);
1433                 STACK_PUSHX(stack, int, regset[0] >= 0);
1434                 STACK_PUSHX(stack, voidptr, node);
1435                 STACK_PUSHX(stack, voidptr, right);
1436                 STACK_PUSHX(stack, voidptr, left);
1437                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1438
1439                 /* Process right child. */
1440                 STACK_PUSHX(stack, voidptr, right);
1441                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1442
1443                 /* After processing left child. */
1444                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1445
1446                 /* Process left child. */
1447                 STACK_PUSHX(stack, voidptr, left);
1448                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1449
1450                 /* Regset is not empty, so add a tag here. */
1451                 if (regset[0] >= 0)
1452                   {
1453                     if (!first_pass)
1454                       {
1455                         int i;
1456                         status = tre_add_tag_left(mem, node, tag);
1457                         tnfa->tag_directions[tag] = direction;
1458                         if (minimal_tag >= 0)
1459                           {
1460                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1461                             tnfa->minimal_tags[i] = tag;
1462                             tnfa->minimal_tags[i + 1] = minimal_tag;
1463                             tnfa->minimal_tags[i + 2] = -1;
1464                             minimal_tag = -1;
1465                             num_minimals++;
1466                           }
1467                         tre_purge_regset(regset, tnfa, tag);
1468                       }
1469
1470                     regset[0] = -1;
1471                     tag = next_tag;
1472                     num_tags++;
1473                     next_tag++;
1474                   }
1475
1476                 if (node->num_submatches > 0)
1477                   {
1478                     /* The next two tags are reserved for markers. */
1479                     next_tag++;
1480                     tag = next_tag;
1481                     next_tag++;
1482                   }
1483
1484                 break;
1485               }
1486             }
1487
1488           if (node->submatch_id >= 0)
1489             {
1490               int i;
1491               /* Push this submatch on the parents stack. */
1492               for (i = 0; parents[i] >= 0; i++);
1493               parents[i] = node->submatch_id;
1494               parents[i + 1] = -1;
1495             }
1496
1497           break; /* end case: ADDTAGS_RECURSE */
1498
1499         case ADDTAGS_AFTER_ITERATION:
1500           {
1501             int minimal = 0;
1502             int enter_tag;
1503             node = tre_stack_pop_voidptr(stack);
1504             if (first_pass)
1505               {
1506                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1507                   + tre_stack_pop_int(stack);
1508                 minimal_tag = -1;
1509               }
1510             else
1511               {
1512                 minimal = tre_stack_pop_int(stack);
1513                 enter_tag = tre_stack_pop_int(stack);
1514                 if (minimal)
1515                   minimal_tag = enter_tag;
1516               }
1517
1518             if (!first_pass)
1519               {
1520                 if (minimal)
1521                   direction = TRE_TAG_MINIMIZE;
1522                 else
1523                   direction = TRE_TAG_MAXIMIZE;
1524               }
1525             break;
1526           }
1527
1528         case ADDTAGS_AFTER_CAT_LEFT:
1529           {
1530             int new_tag = tre_stack_pop_int(stack);
1531             next_tag = tre_stack_pop_int(stack);
1532             if (new_tag >= 0)
1533               {
1534                 tag = new_tag;
1535               }
1536             break;
1537           }
1538
1539         case ADDTAGS_AFTER_CAT_RIGHT:
1540           node = tre_stack_pop_voidptr(stack);
1541           if (first_pass)
1542             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1543               + ((tre_catenation_t *)node->obj)->right->num_tags;
1544           break;
1545
1546         case ADDTAGS_AFTER_UNION_LEFT:
1547           /* Lift the bottom of the `regset' array so that when processing
1548              the right operand the items currently in the array are
1549              invisible.  The original bottom was saved at ADDTAGS_UNION and
1550              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1551           while (*regset >= 0)
1552             regset++;
1553           break;
1554
1555         case ADDTAGS_AFTER_UNION_RIGHT:
1556           {
1557             int added_tags, tag_left, tag_right;
1558             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1559             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1560             node = tre_stack_pop_voidptr(stack);
1561             added_tags = tre_stack_pop_int(stack);
1562             if (first_pass)
1563               {
1564                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1565                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1566                   + ((node->num_submatches > 0) ? 2 : 0);
1567               }
1568             regset = tre_stack_pop_voidptr(stack);
1569             tag_left = tre_stack_pop_int(stack);
1570             tag_right = tre_stack_pop_int(stack);
1571
1572             /* Add tags after both children, the left child gets a smaller
1573                tag than the right child.  This guarantees that we prefer
1574                the left child over the right child. */
1575             /* XXX - This is not always necessary (if the children have
1576                tags which must be seen for every match of that child). */
1577             /* XXX - Check if this is the only place where tre_add_tag_right
1578                is used.  If so, use tre_add_tag_left (putting the tag before
1579                the child as opposed after the child) and throw away
1580                tre_add_tag_right. */
1581             if (node->num_submatches > 0)
1582               {
1583                 if (!first_pass)
1584                   {
1585                     status = tre_add_tag_right(mem, left, tag_left);
1586                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1587                     status = tre_add_tag_right(mem, right, tag_right);
1588                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1589                   }
1590                 num_tags += 2;
1591               }
1592             direction = TRE_TAG_MAXIMIZE;
1593             break;
1594           }
1595
1596         default:
1597           assert(0);
1598           break;
1599
1600         } /* end switch(symbol) */
1601     } /* end while(tre_stack_num_objects(stack) > bottom) */
1602
1603   if (!first_pass)
1604     tre_purge_regset(regset, tnfa, tag);
1605
1606   if (!first_pass && minimal_tag >= 0)
1607     {
1608       int i;
1609       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1610       tnfa->minimal_tags[i] = tag;
1611       tnfa->minimal_tags[i + 1] = minimal_tag;
1612       tnfa->minimal_tags[i + 2] = -1;
1613       minimal_tag = -1;
1614       num_minimals++;
1615     }
1616
1617   assert(tree->num_tags == num_tags);
1618   tnfa->end_tag = num_tags;
1619   tnfa->num_tags = num_tags;
1620   tnfa->num_minimals = num_minimals;
1621   xfree(orig_regset);
1622   xfree(parents);
1623   xfree(saved_states);
1624   return status;
1625 }
1626
1627
1628
1629 /*
1630   AST to TNFA compilation routines.
1631 */
1632
1633 typedef enum {
1634   COPY_RECURSE,
1635   COPY_SET_RESULT_PTR
1636 } tre_copyast_symbol_t;
1637
1638 /* Flags for tre_copy_ast(). */
1639 #define COPY_REMOVE_TAGS         1
1640 #define COPY_MAXIMIZE_FIRST_TAG  2
1641
1642 static reg_errcode_t
1643 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1644              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1645              tre_ast_node_t **copy, int *max_pos)
1646 {
1647   reg_errcode_t status = REG_OK;
1648   int bottom = tre_stack_num_objects(stack);
1649   int num_copied = 0;
1650   int first_tag = 1;
1651   tre_ast_node_t **result = copy;
1652   tre_copyast_symbol_t symbol;
1653
1654   STACK_PUSH(stack, voidptr, ast);
1655   STACK_PUSH(stack, int, COPY_RECURSE);
1656
1657   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1658     {
1659       tre_ast_node_t *node;
1660       if (status != REG_OK)
1661         break;
1662
1663       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1664       switch (symbol)
1665         {
1666         case COPY_SET_RESULT_PTR:
1667           result = tre_stack_pop_voidptr(stack);
1668           break;
1669         case COPY_RECURSE:
1670           node = tre_stack_pop_voidptr(stack);
1671           switch (node->type)
1672             {
1673             case LITERAL:
1674               {
1675                 tre_literal_t *lit = node->obj;
1676                 int pos = lit->position;
1677                 int min = lit->code_min;
1678                 int max = lit->code_max;
1679                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1680                   {
1681                     /* XXX - e.g. [ab] has only one position but two
1682                        nodes, so we are creating holes in the state space
1683                        here.  Not fatal, just wastes memory. */
1684                     pos += *pos_add;
1685                     num_copied++;
1686                   }
1687                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1688                   {
1689                     /* Change this tag to empty. */
1690                     min = EMPTY;
1691                     max = pos = -1;
1692                   }
1693                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1694                          && first_tag)
1695                   {
1696                     /* Maximize the first tag. */
1697                     tag_directions[max] = TRE_TAG_MAXIMIZE;
1698                     first_tag = 0;
1699                   }
1700                 *result = tre_ast_new_literal(mem, min, max, pos);
1701                 if (*result == NULL)
1702                   status = REG_ESPACE;
1703
1704                 if (pos > *max_pos)
1705                   *max_pos = pos;
1706                 break;
1707               }
1708             case UNION:
1709               {
1710                 tre_union_t *uni = node->obj;
1711                 tre_union_t *tmp;
1712                 *result = tre_ast_new_union(mem, uni->left, uni->right);
1713                 if (*result == NULL)
1714                   {
1715                     status = REG_ESPACE;
1716                     break;
1717                   }
1718                 tmp = (*result)->obj;
1719                 result = &tmp->left;
1720                 STACK_PUSHX(stack, voidptr, uni->right);
1721                 STACK_PUSHX(stack, int, COPY_RECURSE);
1722                 STACK_PUSHX(stack, voidptr, &tmp->right);
1723                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1724                 STACK_PUSHX(stack, voidptr, uni->left);
1725                 STACK_PUSHX(stack, int, COPY_RECURSE);
1726                 break;
1727               }
1728             case CATENATION:
1729               {
1730                 tre_catenation_t *cat = node->obj;
1731                 tre_catenation_t *tmp;
1732                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1733                 if (*result == NULL)
1734                   {
1735                     status = REG_ESPACE;
1736                     break;
1737                   }
1738                 tmp = (*result)->obj;
1739                 tmp->left = NULL;
1740                 tmp->right = NULL;
1741                 result = &tmp->left;
1742
1743                 STACK_PUSHX(stack, voidptr, cat->right);
1744                 STACK_PUSHX(stack, int, COPY_RECURSE);
1745                 STACK_PUSHX(stack, voidptr, &tmp->right);
1746                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1747                 STACK_PUSHX(stack, voidptr, cat->left);
1748                 STACK_PUSHX(stack, int, COPY_RECURSE);
1749                 break;
1750               }
1751             case ITERATION:
1752               {
1753                 tre_iteration_t *iter = node->obj;
1754                 STACK_PUSHX(stack, voidptr, iter->arg);
1755                 STACK_PUSHX(stack, int, COPY_RECURSE);
1756                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1757                                            iter->max, iter->minimal);
1758                 if (*result == NULL)
1759                   {
1760                     status = REG_ESPACE;
1761                     break;
1762                   }
1763                 iter = (*result)->obj;
1764                 result = &iter->arg;
1765                 break;
1766               }
1767             default:
1768               assert(0);
1769               break;
1770             }
1771           break;
1772         }
1773     }
1774   *pos_add += num_copied;
1775   return status;
1776 }
1777
1778 typedef enum {
1779   EXPAND_RECURSE,
1780   EXPAND_AFTER_ITER
1781 } tre_expand_ast_symbol_t;
1782
1783 /* Expands each iteration node that has a finite nonzero minimum or maximum
1784    iteration count to a catenated sequence of copies of the node. */
1785 static reg_errcode_t
1786 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1787                int *position, tre_tag_direction_t *tag_directions)
1788 {
1789   reg_errcode_t status = REG_OK;
1790   int bottom = tre_stack_num_objects(stack);
1791   int pos_add = 0;
1792   int pos_add_total = 0;
1793   int max_pos = 0;
1794   int iter_depth = 0;
1795
1796   STACK_PUSHR(stack, voidptr, ast);
1797   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1798   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1799     {
1800       tre_ast_node_t *node;
1801       tre_expand_ast_symbol_t symbol;
1802
1803       if (status != REG_OK)
1804         break;
1805
1806       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1807       node = tre_stack_pop_voidptr(stack);
1808       switch (symbol)
1809         {
1810         case EXPAND_RECURSE:
1811           switch (node->type)
1812             {
1813             case LITERAL:
1814               {
1815                 tre_literal_t *lit= node->obj;
1816                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1817                   {
1818                     lit->position += pos_add;
1819                     if (lit->position > max_pos)
1820                       max_pos = lit->position;
1821                   }
1822                 break;
1823               }
1824             case UNION:
1825               {
1826                 tre_union_t *uni = node->obj;
1827                 STACK_PUSHX(stack, voidptr, uni->right);
1828                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1829                 STACK_PUSHX(stack, voidptr, uni->left);
1830                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1831                 break;
1832               }
1833             case CATENATION:
1834               {
1835                 tre_catenation_t *cat = node->obj;
1836                 STACK_PUSHX(stack, voidptr, cat->right);
1837                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1838                 STACK_PUSHX(stack, voidptr, cat->left);
1839                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1840                 break;
1841               }
1842             case ITERATION:
1843               {
1844                 tre_iteration_t *iter = node->obj;
1845                 STACK_PUSHX(stack, int, pos_add);
1846                 STACK_PUSHX(stack, voidptr, node);
1847                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1848                 STACK_PUSHX(stack, voidptr, iter->arg);
1849                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1850                 /* If we are going to expand this node at EXPAND_AFTER_ITER
1851                    then don't increase the `pos' fields of the nodes now, it
1852                    will get done when expanding. */
1853                 if (iter->min > 1 || iter->max > 1)
1854                   pos_add = 0;
1855                 iter_depth++;
1856                 break;
1857               }
1858             default:
1859               assert(0);
1860               break;
1861             }
1862           break;
1863         case EXPAND_AFTER_ITER:
1864           {
1865             tre_iteration_t *iter = node->obj;
1866             int pos_add_last;
1867             pos_add = tre_stack_pop_int(stack);
1868             pos_add_last = pos_add;
1869             if (iter->min > 1 || iter->max > 1)
1870               {
1871                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1872                 int j;
1873                 int pos_add_save = pos_add;
1874
1875                 /* Create a catenated sequence of copies of the node. */
1876                 for (j = 0; j < iter->min; j++)
1877                   {
1878                     tre_ast_node_t *copy;
1879                     /* Remove tags from all but the last copy. */
1880                     int flags = ((j + 1 < iter->min)
1881                                  ? COPY_REMOVE_TAGS
1882                                  : COPY_MAXIMIZE_FIRST_TAG);
1883                     pos_add_save = pos_add;
1884                     status = tre_copy_ast(mem, stack, iter->arg, flags,
1885                                           &pos_add, tag_directions, &copy,
1886                                           &max_pos);
1887                     if (status != REG_OK)
1888                       return status;
1889                     if (seq1 != NULL)
1890                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
1891                     else
1892                       seq1 = copy;
1893                     if (seq1 == NULL)
1894                       return REG_ESPACE;
1895                   }
1896
1897                 if (iter->max == -1)
1898                   {
1899                     /* No upper limit. */
1900                     pos_add_save = pos_add;
1901                     status = tre_copy_ast(mem, stack, iter->arg, 0,
1902                                           &pos_add, NULL, &seq2, &max_pos);
1903                     if (status != REG_OK)
1904                       return status;
1905                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1906                     if (seq2 == NULL)
1907                       return REG_ESPACE;
1908                   }
1909                 else
1910                   {
1911                     for (j = iter->min; j < iter->max; j++)
1912                       {
1913                         tre_ast_node_t *tmp, *copy;
1914                         pos_add_save = pos_add;
1915                         status = tre_copy_ast(mem, stack, iter->arg, 0,
1916                                               &pos_add, NULL, &copy, &max_pos);
1917                         if (status != REG_OK)
1918                           return status;
1919                         if (seq2 != NULL)
1920                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
1921                         else
1922                           seq2 = copy;
1923                         if (seq2 == NULL)
1924                           return REG_ESPACE;
1925                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1926                         if (tmp == NULL)
1927                           return REG_ESPACE;
1928                         seq2 = tre_ast_new_union(mem, tmp, seq2);
1929                         if (seq2 == NULL)
1930                           return REG_ESPACE;
1931                       }
1932                   }
1933
1934                 pos_add = pos_add_save;
1935                 if (seq1 == NULL)
1936                   seq1 = seq2;
1937                 else if (seq2 != NULL)
1938                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1939                 if (seq1 == NULL)
1940                   return REG_ESPACE;
1941                 node->obj = seq1->obj;
1942                 node->type = seq1->type;
1943               }
1944
1945             iter_depth--;
1946             pos_add_total += pos_add - pos_add_last;
1947             if (iter_depth == 0)
1948               pos_add = pos_add_total;
1949
1950             break;
1951           }
1952         default:
1953           assert(0);
1954           break;
1955         }
1956     }
1957
1958   *position += pos_add_total;
1959
1960   /* `max_pos' should never be larger than `*position' if the above
1961      code works, but just an extra safeguard let's make sure
1962      `*position' is set large enough so enough memory will be
1963      allocated for the transition table. */
1964   if (max_pos > *position)
1965     *position = max_pos;
1966
1967   return status;
1968 }
1969
1970 static tre_pos_and_tags_t *
1971 tre_set_empty(tre_mem_t mem)
1972 {
1973   tre_pos_and_tags_t *new_set;
1974
1975   new_set = tre_mem_calloc(mem, sizeof(*new_set));
1976   if (new_set == NULL)
1977     return NULL;
1978
1979   new_set[0].position = -1;
1980   new_set[0].code_min = -1;
1981   new_set[0].code_max = -1;
1982
1983   return new_set;
1984 }
1985
1986 static tre_pos_and_tags_t *
1987 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1988             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1989 {
1990   tre_pos_and_tags_t *new_set;
1991
1992   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1993   if (new_set == NULL)
1994     return NULL;
1995
1996   new_set[0].position = position;
1997   new_set[0].code_min = code_min;
1998   new_set[0].code_max = code_max;
1999   new_set[0].class = class;
2000   new_set[0].neg_classes = neg_classes;
2001   new_set[0].backref = backref;
2002   new_set[1].position = -1;
2003   new_set[1].code_min = -1;
2004   new_set[1].code_max = -1;
2005
2006   return new_set;
2007 }
2008
2009 static tre_pos_and_tags_t *
2010 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2011               int *tags, int assertions)
2012 {
2013   int s1, s2, i, j;
2014   tre_pos_and_tags_t *new_set;
2015   int *new_tags;
2016   int num_tags;
2017
2018   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2019   for (s1 = 0; set1[s1].position >= 0; s1++);
2020   for (s2 = 0; set2[s2].position >= 0; s2++);
2021   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2022   if (!new_set )
2023     return NULL;
2024
2025   for (s1 = 0; set1[s1].position >= 0; s1++)
2026     {
2027       new_set[s1].position = set1[s1].position;
2028       new_set[s1].code_min = set1[s1].code_min;
2029       new_set[s1].code_max = set1[s1].code_max;
2030       new_set[s1].assertions = set1[s1].assertions | assertions;
2031       new_set[s1].class = set1[s1].class;
2032       new_set[s1].neg_classes = set1[s1].neg_classes;
2033       new_set[s1].backref = set1[s1].backref;
2034       if (set1[s1].tags == NULL && tags == NULL)
2035         new_set[s1].tags = NULL;
2036       else
2037         {
2038           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2039           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2040                                          * (i + num_tags + 1)));
2041           if (new_tags == NULL)
2042             return NULL;
2043           for (j = 0; j < i; j++)
2044             new_tags[j] = set1[s1].tags[j];
2045           for (i = 0; i < num_tags; i++)
2046             new_tags[j + i] = tags[i];
2047           new_tags[j + i] = -1;
2048           new_set[s1].tags = new_tags;
2049         }
2050     }
2051
2052   for (s2 = 0; set2[s2].position >= 0; s2++)
2053     {
2054       new_set[s1 + s2].position = set2[s2].position;
2055       new_set[s1 + s2].code_min = set2[s2].code_min;
2056       new_set[s1 + s2].code_max = set2[s2].code_max;
2057       /* XXX - why not | assertions here as well? */
2058       new_set[s1 + s2].assertions = set2[s2].assertions;
2059       new_set[s1 + s2].class = set2[s2].class;
2060       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2061       new_set[s1 + s2].backref = set2[s2].backref;
2062       if (set2[s2].tags == NULL)
2063         new_set[s1 + s2].tags = NULL;
2064       else
2065         {
2066           for (i = 0; set2[s2].tags[i] >= 0; i++);
2067           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2068           if (new_tags == NULL)
2069             return NULL;
2070           for (j = 0; j < i; j++)
2071             new_tags[j] = set2[s2].tags[j];
2072           new_tags[j] = -1;
2073           new_set[s1 + s2].tags = new_tags;
2074         }
2075     }
2076   new_set[s1 + s2].position = -1;
2077   return new_set;
2078 }
2079
2080 /* Finds the empty path through `node' which is the one that should be
2081    taken according to POSIX.2 rules, and adds the tags on that path to
2082    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2083    set to the number of tags seen on the path. */
2084 static reg_errcode_t
2085 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2086                 int *assertions, int *num_tags_seen)
2087 {
2088   tre_literal_t *lit;
2089   tre_union_t *uni;
2090   tre_catenation_t *cat;
2091   tre_iteration_t *iter;
2092   int i;
2093   int bottom = tre_stack_num_objects(stack);
2094   reg_errcode_t status = REG_OK;
2095   if (num_tags_seen)
2096     *num_tags_seen = 0;
2097
2098   status = tre_stack_push_voidptr(stack, node);
2099
2100   /* Walk through the tree recursively. */
2101   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2102     {
2103       node = tre_stack_pop_voidptr(stack);
2104
2105       switch (node->type)
2106         {
2107         case LITERAL:
2108           lit = (tre_literal_t *)node->obj;
2109           switch (lit->code_min)
2110             {
2111             case TAG:
2112               if (lit->code_max >= 0)
2113                 {
2114                   if (tags != NULL)
2115                     {
2116                       /* Add the tag to `tags'. */
2117                       for (i = 0; tags[i] >= 0; i++)
2118                         if (tags[i] == lit->code_max)
2119                           break;
2120                       if (tags[i] < 0)
2121                         {
2122                           tags[i] = lit->code_max;
2123                           tags[i + 1] = -1;
2124                         }
2125                     }
2126                   if (num_tags_seen)
2127                     (*num_tags_seen)++;
2128                 }
2129               break;
2130             case ASSERTION:
2131               assert(lit->code_max >= 1
2132                      || lit->code_max <= ASSERT_LAST);
2133               if (assertions != NULL)
2134                 *assertions |= lit->code_max;
2135               break;
2136             case EMPTY:
2137               break;
2138             default:
2139               assert(0);
2140               break;
2141             }
2142           break;
2143
2144         case UNION:
2145           /* Subexpressions starting earlier take priority over ones
2146              starting later, so we prefer the left subexpression over the
2147              right subexpression. */
2148           uni = (tre_union_t *)node->obj;
2149           if (uni->left->nullable)
2150             STACK_PUSHX(stack, voidptr, uni->left)
2151           else if (uni->right->nullable)
2152             STACK_PUSHX(stack, voidptr, uni->right)
2153           else
2154             assert(0);
2155           break;
2156
2157         case CATENATION:
2158           /* The path must go through both children. */
2159           cat = (tre_catenation_t *)node->obj;
2160           assert(cat->left->nullable);
2161           assert(cat->right->nullable);
2162           STACK_PUSHX(stack, voidptr, cat->left);
2163           STACK_PUSHX(stack, voidptr, cat->right);
2164           break;
2165
2166         case ITERATION:
2167           /* A match with an empty string is preferred over no match at
2168              all, so we go through the argument if possible. */
2169           iter = (tre_iteration_t *)node->obj;
2170           if (iter->arg->nullable)
2171             STACK_PUSHX(stack, voidptr, iter->arg);
2172           break;
2173
2174         default:
2175           assert(0);
2176           break;
2177         }
2178     }
2179
2180   return status;
2181 }
2182
2183
2184 typedef enum {
2185   NFL_RECURSE,
2186   NFL_POST_UNION,
2187   NFL_POST_CATENATION,
2188   NFL_POST_ITERATION
2189 } tre_nfl_stack_symbol_t;
2190
2191
2192 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2193    the nodes of the AST `tree'. */
2194 static reg_errcode_t
2195 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2196 {
2197   int bottom = tre_stack_num_objects(stack);
2198
2199   STACK_PUSHR(stack, voidptr, tree);
2200   STACK_PUSHR(stack, int, NFL_RECURSE);
2201
2202   while (tre_stack_num_objects(stack) > bottom)
2203     {
2204       tre_nfl_stack_symbol_t symbol;
2205       tre_ast_node_t *node;
2206
2207       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2208       node = tre_stack_pop_voidptr(stack);
2209       switch (symbol)
2210         {
2211         case NFL_RECURSE:
2212           switch (node->type)
2213             {
2214             case LITERAL:
2215               {
2216                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2217                 if (IS_BACKREF(lit))
2218                   {
2219                     /* Back references: nullable = false, firstpos = {i},
2220                        lastpos = {i}. */
2221                     node->nullable = 0;
2222                     node->firstpos = tre_set_one(mem, lit->position, 0,
2223                                              TRE_CHAR_MAX, 0, NULL, -1);
2224                     if (!node->firstpos)
2225                       return REG_ESPACE;
2226                     node->lastpos = tre_set_one(mem, lit->position, 0,
2227                                                 TRE_CHAR_MAX, 0, NULL,
2228                                                 (int)lit->code_max);
2229                     if (!node->lastpos)
2230                       return REG_ESPACE;
2231                   }
2232                 else if (lit->code_min < 0)
2233                   {
2234                     /* Tags, empty strings, params, and zero width assertions:
2235                        nullable = true, firstpos = {}, and lastpos = {}. */
2236                     node->nullable = 1;
2237                     node->firstpos = tre_set_empty(mem);
2238                     if (!node->firstpos)
2239                       return REG_ESPACE;
2240                     node->lastpos = tre_set_empty(mem);
2241                     if (!node->lastpos)
2242                       return REG_ESPACE;
2243                   }
2244                 else
2245                   {
2246                     /* Literal at position i: nullable = false, firstpos = {i},
2247                        lastpos = {i}. */
2248                     node->nullable = 0;
2249                     node->firstpos =
2250                       tre_set_one(mem, lit->position, (int)lit->code_min,
2251                                   (int)lit->code_max, 0, NULL, -1);
2252                     if (!node->firstpos)
2253                       return REG_ESPACE;
2254                     node->lastpos = tre_set_one(mem, lit->position,
2255                                                 (int)lit->code_min,
2256                                                 (int)lit->code_max,
2257                                                 lit->class, lit->neg_classes,
2258                                                 -1);
2259                     if (!node->lastpos)
2260                       return REG_ESPACE;
2261                   }
2262                 break;
2263               }
2264
2265             case UNION:
2266               /* Compute the attributes for the two subtrees, and after that
2267                  for this node. */
2268               STACK_PUSHR(stack, voidptr, node);
2269               STACK_PUSHR(stack, int, NFL_POST_UNION);
2270               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2271               STACK_PUSHR(stack, int, NFL_RECURSE);
2272               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2273               STACK_PUSHR(stack, int, NFL_RECURSE);
2274               break;
2275
2276             case CATENATION:
2277               /* Compute the attributes for the two subtrees, and after that
2278                  for this node. */
2279               STACK_PUSHR(stack, voidptr, node);
2280               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2281               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2282               STACK_PUSHR(stack, int, NFL_RECURSE);
2283               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2284               STACK_PUSHR(stack, int, NFL_RECURSE);
2285               break;
2286
2287             case ITERATION:
2288               /* Compute the attributes for the subtree, and after that for
2289                  this node. */
2290               STACK_PUSHR(stack, voidptr, node);
2291               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2292               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2293               STACK_PUSHR(stack, int, NFL_RECURSE);
2294               break;
2295             }
2296           break; /* end case: NFL_RECURSE */
2297
2298         case NFL_POST_UNION:
2299           {
2300             tre_union_t *uni = (tre_union_t *)node->obj;
2301             node->nullable = uni->left->nullable || uni->right->nullable;
2302             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2303                                            uni->right->firstpos, NULL, 0);
2304             if (!node->firstpos)
2305               return REG_ESPACE;
2306             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2307                                           uni->right->lastpos, NULL, 0);
2308             if (!node->lastpos)
2309               return REG_ESPACE;
2310             break;
2311           }
2312
2313         case NFL_POST_ITERATION:
2314           {
2315             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2316
2317             if (iter->min == 0 || iter->arg->nullable)
2318               node->nullable = 1;
2319             else
2320               node->nullable = 0;
2321             node->firstpos = iter->arg->firstpos;
2322             node->lastpos = iter->arg->lastpos;
2323             break;
2324           }
2325
2326         case NFL_POST_CATENATION:
2327           {
2328             int num_tags, *tags, assertions;
2329             reg_errcode_t status;
2330             tre_catenation_t *cat = node->obj;
2331             node->nullable = cat->left->nullable && cat->right->nullable;
2332
2333             /* Compute firstpos. */
2334             if (cat->left->nullable)
2335               {
2336                 /* The left side matches the empty string.  Make a first pass
2337                    with tre_match_empty() to get the number of tags and
2338                    parameters. */
2339                 status = tre_match_empty(stack, cat->left,
2340                                          NULL, NULL, &num_tags);
2341                 if (status != REG_OK)
2342                   return status;
2343                 /* Allocate arrays for the tags and parameters. */
2344                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2345                 if (!tags)
2346                   return REG_ESPACE;
2347                 tags[0] = -1;
2348                 assertions = 0;
2349                 /* Second pass with tre_mach_empty() to get the list of
2350                    tags and parameters. */
2351                 status = tre_match_empty(stack, cat->left, tags,
2352                                          &assertions, NULL);
2353                 if (status != REG_OK)
2354                   {
2355                     xfree(tags);
2356                     return status;
2357                   }
2358                 node->firstpos =
2359                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2360                                 tags, assertions);
2361                 xfree(tags);
2362                 if (!node->firstpos)
2363                   return REG_ESPACE;
2364               }
2365             else
2366               {
2367                 node->firstpos = cat->left->firstpos;
2368               }
2369
2370             /* Compute lastpos. */
2371             if (cat->right->nullable)
2372               {
2373                 /* The right side matches the empty string.  Make a first pass
2374                    with tre_match_empty() to get the number of tags and
2375                    parameters. */
2376                 status = tre_match_empty(stack, cat->right,
2377                                          NULL, NULL, &num_tags);
2378                 if (status != REG_OK)
2379                   return status;
2380                 /* Allocate arrays for the tags and parameters. */
2381                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2382                 if (!tags)
2383                   return REG_ESPACE;
2384                 tags[0] = -1;
2385                 assertions = 0;
2386                 /* Second pass with tre_mach_empty() to get the list of
2387                    tags and parameters. */
2388                 status = tre_match_empty(stack, cat->right, tags,
2389                                          &assertions, NULL);
2390                 if (status != REG_OK)
2391                   {
2392                     xfree(tags);
2393                     return status;
2394                   }
2395                 node->lastpos =
2396                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2397                                 tags, assertions);
2398                 xfree(tags);
2399                 if (!node->lastpos)
2400                   return REG_ESPACE;
2401               }
2402             else
2403               {
2404                 node->lastpos = cat->right->lastpos;
2405               }
2406             break;
2407           }
2408
2409         default:
2410           assert(0);
2411           break;
2412         }
2413     }
2414
2415   return REG_OK;
2416 }
2417
2418
2419 /* Adds a transition from each position in `p1' to each position in `p2'. */
2420 static reg_errcode_t
2421 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2422                tre_tnfa_transition_t *transitions,
2423                int *counts, int *offs)
2424 {
2425   tre_pos_and_tags_t *orig_p2 = p2;
2426   tre_tnfa_transition_t *trans;
2427   int i, j, k, l, dup, prev_p2_pos;
2428
2429   if (transitions != NULL)
2430     while (p1->position >= 0)
2431       {
2432         p2 = orig_p2;
2433         prev_p2_pos = -1;
2434         while (p2->position >= 0)
2435           {
2436             /* Optimization: if this position was already handled, skip it. */
2437             if (p2->position == prev_p2_pos)
2438               {
2439                 p2++;
2440                 continue;
2441               }
2442             prev_p2_pos = p2->position;
2443             /* Set `trans' to point to the next unused transition from
2444                position `p1->position'. */
2445             trans = transitions + offs[p1->position];
2446             while (trans->state != NULL)
2447               {
2448 #if 0
2449                 /* If we find a previous transition from `p1->position' to
2450                    `p2->position', it is overwritten.  This can happen only
2451                    if there are nested loops in the regexp, like in "((a)*)*".
2452                    In POSIX.2 repetition using the outer loop is always
2453                    preferred over using the inner loop.  Therefore the
2454                    transition for the inner loop is useless and can be thrown
2455                    away. */
2456                 /* XXX - The same position is used for all nodes in a bracket
2457                    expression, so this optimization cannot be used (it will
2458                    break bracket expressions) unless I figure out a way to
2459                    detect it here. */
2460                 if (trans->state_id == p2->position)
2461                   {
2462                     break;
2463                   }
2464 #endif
2465                 trans++;
2466               }
2467
2468             if (trans->state == NULL)
2469               (trans + 1)->state = NULL;
2470             /* Use the character ranges, assertions, etc. from `p1' for
2471                the transition from `p1' to `p2'. */
2472             trans->code_min = p1->code_min;
2473             trans->code_max = p1->code_max;
2474             trans->state = transitions + offs[p2->position];
2475             trans->state_id = p2->position;
2476             trans->assertions = p1->assertions | p2->assertions
2477               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2478               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2479             if (p1->backref >= 0)
2480               {
2481                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2482                 assert(p2->backref < 0);
2483                 trans->u.backref = p1->backref;
2484                 trans->assertions |= ASSERT_BACKREF;
2485               }
2486             else
2487               trans->u.class = p1->class;
2488             if (p1->neg_classes != NULL)
2489               {
2490                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2491                 trans->neg_classes =
2492                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2493                 if (trans->neg_classes == NULL)
2494                   return REG_ESPACE;
2495                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2496                   trans->neg_classes[i] = p1->neg_classes[i];
2497                 trans->neg_classes[i] = (tre_ctype_t)0;
2498               }
2499             else
2500               trans->neg_classes = NULL;
2501
2502             /* Find out how many tags this transition has. */
2503             i = 0;
2504             if (p1->tags != NULL)
2505               while(p1->tags[i] >= 0)
2506                 i++;
2507             j = 0;
2508             if (p2->tags != NULL)
2509               while(p2->tags[j] >= 0)
2510                 j++;
2511
2512             /* If we are overwriting a transition, free the old tag array. */
2513             if (trans->tags != NULL)
2514               xfree(trans->tags);
2515             trans->tags = NULL;
2516
2517             /* If there were any tags, allocate an array and fill it. */
2518             if (i + j > 0)
2519               {
2520                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2521                 if (!trans->tags)
2522                   return REG_ESPACE;
2523                 i = 0;
2524                 if (p1->tags != NULL)
2525                   while(p1->tags[i] >= 0)
2526                     {
2527                       trans->tags[i] = p1->tags[i];
2528                       i++;
2529                     }
2530                 l = i;
2531                 j = 0;
2532                 if (p2->tags != NULL)
2533                   while (p2->tags[j] >= 0)
2534                     {
2535                       /* Don't add duplicates. */
2536                       dup = 0;
2537                       for (k = 0; k < i; k++)
2538                         if (trans->tags[k] == p2->tags[j])
2539                           {
2540                             dup = 1;
2541                             break;
2542                           }
2543                       if (!dup)
2544                         trans->tags[l++] = p2->tags[j];
2545                       j++;
2546                     }
2547                 trans->tags[l] = -1;
2548               }
2549
2550             p2++;
2551           }
2552         p1++;
2553       }
2554   else
2555     /* Compute a maximum limit for the number of transitions leaving
2556        from each state. */
2557     while (p1->position >= 0)
2558       {
2559         p2 = orig_p2;
2560         while (p2->position >= 0)
2561           {
2562             counts[p1->position]++;
2563             p2++;
2564           }
2565         p1++;
2566       }
2567   return REG_OK;
2568 }
2569
2570 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
2571    labelled with one character range (there are no transitions on empty
2572    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2573    the regexp. */
2574 static reg_errcode_t
2575 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2576                 int *counts, int *offs)
2577 {
2578   tre_union_t *uni;
2579   tre_catenation_t *cat;
2580   tre_iteration_t *iter;
2581   reg_errcode_t errcode = REG_OK;
2582
2583   /* XXX - recurse using a stack!. */
2584   switch (node->type)
2585     {
2586     case LITERAL:
2587       break;
2588     case UNION:
2589       uni = (tre_union_t *)node->obj;
2590       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2591       if (errcode != REG_OK)
2592         return errcode;
2593       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2594       break;
2595
2596     case CATENATION:
2597       cat = (tre_catenation_t *)node->obj;
2598       /* Add a transition from each position in cat->left->lastpos
2599          to each position in cat->right->firstpos. */
2600       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2601                                transitions, counts, offs);
2602       if (errcode != REG_OK)
2603         return errcode;
2604       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2605       if (errcode != REG_OK)
2606         return errcode;
2607       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2608       break;
2609
2610     case ITERATION:
2611       iter = (tre_iteration_t *)node->obj;
2612       assert(iter->max == -1 || iter->max == 1);
2613
2614       if (iter->max == -1)
2615         {
2616           assert(iter->min == 0 || iter->min == 1);
2617           /* Add a transition from each last position in the iterated
2618              expression to each first position. */
2619           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2620                                    transitions, counts, offs);
2621           if (errcode != REG_OK)
2622             return errcode;
2623         }
2624       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2625       break;
2626     }
2627   return errcode;
2628 }
2629
2630
2631 #define ERROR_EXIT(err)           \
2632   do                              \
2633     {                             \
2634       errcode = err;              \
2635       if (/*CONSTCOND*/1)         \
2636         goto error_exit;          \
2637     }                             \
2638  while (/*CONSTCOND*/0)
2639
2640
2641 int
2642 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2643 {
2644   tre_stack_t *stack;
2645   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2646   tre_pos_and_tags_t *p;
2647   int *counts = NULL, *offs = NULL;
2648   int i, add = 0;
2649   tre_tnfa_transition_t *transitions, *initial;
2650   tre_tnfa_t *tnfa = NULL;
2651   tre_submatch_data_t *submatch_data;
2652   tre_tag_direction_t *tag_directions = NULL;
2653   reg_errcode_t errcode;
2654   tre_mem_t mem;
2655
2656   /* Parse context. */
2657   tre_parse_ctx_t parse_ctx;
2658
2659   /* Allocate a stack used throughout the compilation process for various
2660      purposes. */
2661   stack = tre_stack_new(512, 10240, 128);
2662   if (!stack)
2663     return REG_ESPACE;
2664   /* Allocate a fast memory allocator. */
2665   mem = tre_mem_new();
2666   if (!mem)
2667     {
2668       tre_stack_destroy(stack);
2669       return REG_ESPACE;
2670     }
2671
2672   /* Parse the regexp. */
2673   memset(&parse_ctx, 0, sizeof(parse_ctx));
2674   parse_ctx.mem = mem;
2675   parse_ctx.stack = stack;
2676   parse_ctx.re = regex;
2677   parse_ctx.cflags = cflags;
2678   parse_ctx.max_backref = -1;
2679   errcode = tre_parse(&parse_ctx);
2680   if (errcode != REG_OK)
2681     ERROR_EXIT(errcode);
2682   preg->re_nsub = parse_ctx.submatch_id - 1;
2683   tree = parse_ctx.n;
2684
2685 #ifdef TRE_DEBUG
2686   tre_ast_print(tree);
2687 #endif /* TRE_DEBUG */
2688
2689   /* Referring to nonexistent subexpressions is illegal. */
2690   if (parse_ctx.max_backref > (int)preg->re_nsub)
2691     ERROR_EXIT(REG_ESUBREG);
2692
2693   /* Allocate the TNFA struct. */
2694   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2695   if (tnfa == NULL)
2696     ERROR_EXIT(REG_ESPACE);
2697   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2698   tnfa->have_approx = 0;
2699   tnfa->num_submatches = parse_ctx.submatch_id;
2700
2701   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2702      regexp does not have back references, this can be skipped. */
2703   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2704     {
2705
2706       /* Figure out how many tags we will need. */
2707       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2708       if (errcode != REG_OK)
2709         ERROR_EXIT(errcode);
2710
2711       if (tnfa->num_tags > 0)
2712         {
2713           tag_directions = xmalloc(sizeof(*tag_directions)
2714                                    * (tnfa->num_tags + 1));
2715           if (tag_directions == NULL)
2716             ERROR_EXIT(REG_ESPACE);
2717           tnfa->tag_directions = tag_directions;
2718           memset(tag_directions, -1,
2719                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
2720         }
2721       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2722                                    sizeof(*tnfa->minimal_tags));
2723       if (tnfa->minimal_tags == NULL)
2724         ERROR_EXIT(REG_ESPACE);
2725
2726       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2727                               sizeof(*submatch_data));
2728       if (submatch_data == NULL)
2729         ERROR_EXIT(REG_ESPACE);
2730       tnfa->submatch_data = submatch_data;
2731
2732       errcode = tre_add_tags(mem, stack, tree, tnfa);
2733       if (errcode != REG_OK)
2734         ERROR_EXIT(errcode);
2735
2736     }
2737
2738   /* Expand iteration nodes. */
2739   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2740                            tag_directions);
2741   if (errcode != REG_OK)
2742     ERROR_EXIT(errcode);
2743
2744   /* Add a dummy node for the final state.
2745      XXX - For certain patterns this dummy node can be optimized away,
2746            for example "a*" or "ab*".   Figure out a simple way to detect
2747            this possibility. */
2748   tmp_ast_l = tree;
2749   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2750   if (tmp_ast_r == NULL)
2751     ERROR_EXIT(REG_ESPACE);
2752
2753   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2754   if (tree == NULL)
2755     ERROR_EXIT(REG_ESPACE);
2756
2757   errcode = tre_compute_nfl(mem, stack, tree);
2758   if (errcode != REG_OK)
2759     ERROR_EXIT(errcode);
2760
2761   counts = xmalloc(sizeof(int) * parse_ctx.position);
2762   if (counts == NULL)
2763     ERROR_EXIT(REG_ESPACE);
2764
2765   offs = xmalloc(sizeof(int) * parse_ctx.position);
2766   if (offs == NULL)
2767     ERROR_EXIT(REG_ESPACE);
2768
2769   for (i = 0; i < parse_ctx.position; i++)
2770     counts[i] = 0;
2771   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2772
2773   add = 0;
2774   for (i = 0; i < parse_ctx.position; i++)
2775     {
2776       offs[i] = add;
2777       add += counts[i] + 1;
2778       counts[i] = 0;
2779     }
2780   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2781   if (transitions == NULL)
2782     ERROR_EXIT(REG_ESPACE);
2783   tnfa->transitions = transitions;
2784   tnfa->num_transitions = add;
2785
2786   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2787   if (errcode != REG_OK)
2788     ERROR_EXIT(errcode);
2789
2790   tnfa->firstpos_chars = NULL;
2791
2792   p = tree->firstpos;
2793   i = 0;
2794   while (p->position >= 0)
2795     {
2796       i++;
2797       p++;
2798     }
2799
2800   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2801   if (initial == NULL)
2802     ERROR_EXIT(REG_ESPACE);
2803   tnfa->initial = initial;
2804
2805   i = 0;
2806   for (p = tree->firstpos; p->position >= 0; p++)
2807     {
2808       initial[i].state = transitions + offs[p->position];
2809       initial[i].state_id = p->position;
2810       initial[i].tags = NULL;
2811       /* Copy the arrays p->tags, and p->params, they are allocated
2812          from a tre_mem object. */
2813       if (p->tags)
2814         {
2815           int j;
2816           for (j = 0; p->tags[j] >= 0; j++);
2817           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2818           if (!initial[i].tags)
2819             ERROR_EXIT(REG_ESPACE);
2820           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2821         }
2822       initial[i].assertions = p->assertions;
2823       i++;
2824     }
2825   initial[i].state = NULL;
2826
2827   tnfa->num_transitions = add;
2828   tnfa->final = transitions + offs[tree->lastpos[0].position];
2829   tnfa->num_states = parse_ctx.position;
2830   tnfa->cflags = cflags;
2831
2832   tre_mem_destroy(mem);
2833   tre_stack_destroy(stack);
2834   xfree(counts);
2835   xfree(offs);
2836
2837   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2838   return REG_OK;
2839
2840  error_exit:
2841   /* Free everything that was allocated and return the error code. */
2842   tre_mem_destroy(mem);
2843   if (stack != NULL)
2844     tre_stack_destroy(stack);
2845   if (counts != NULL)
2846     xfree(counts);
2847   if (offs != NULL)
2848     xfree(offs);
2849   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2850   regfree(preg);
2851   return errcode;
2852 }
2853
2854
2855
2856
2857 void
2858 regfree(regex_t *preg)
2859 {
2860   tre_tnfa_t *tnfa;
2861   unsigned int i;
2862   tre_tnfa_transition_t *trans;
2863
2864   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2865   if (!tnfa)
2866     return;
2867
2868   for (i = 0; i < tnfa->num_transitions; i++)
2869     if (tnfa->transitions[i].state)
2870       {
2871         if (tnfa->transitions[i].tags)
2872           xfree(tnfa->transitions[i].tags);
2873         if (tnfa->transitions[i].neg_classes)
2874           xfree(tnfa->transitions[i].neg_classes);
2875       }
2876   if (tnfa->transitions)
2877     xfree(tnfa->transitions);
2878
2879   if (tnfa->initial)
2880     {
2881       for (trans = tnfa->initial; trans->state; trans++)
2882         {
2883           if (trans->tags)
2884             xfree(trans->tags);
2885         }
2886       xfree(tnfa->initial);
2887     }
2888
2889   if (tnfa->submatch_data)
2890     {
2891       for (i = 0; i < tnfa->num_submatches; i++)
2892         if (tnfa->submatch_data[i].parents)
2893           xfree(tnfa->submatch_data[i].parents);
2894       xfree(tnfa->submatch_data);
2895     }
2896
2897   if (tnfa->tag_directions)
2898     xfree(tnfa->tag_directions);
2899   if (tnfa->firstpos_chars)
2900     xfree(tnfa->firstpos_chars);
2901   if (tnfa->minimal_tags)
2902     xfree(tnfa->minimal_tags);
2903   xfree(tnfa);
2904 }