nsz Git - musl/blob - src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37 #include <ctype.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56 } tre_pos_and_tags_t;
  57
  58
  59 /***********************************************************************
  60  from tre-ast.c and tre-ast.h
  61 ***********************************************************************/
  62
  63 /* The different AST node types. */
  64 typedef enum {
  65   LITERAL,
  66   CATENATION,
  67   ITERATION,
  68   UNION
  69 } tre_ast_type_t;
  70
  71 /* Special subtypes of TRE_LITERAL. */
  72 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  73 #define ASSERTION -2   /* Assertion leaf. */
  74 #define TAG       -3   /* Tag leaf. */
  75 #define BACKREF   -4   /* Back reference leaf. */
  76
  77 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  78 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  80 #define IS_TAG(x)       ((x)->code_min == TAG)
  81 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  82
  83
  84 /* A generic AST node.  All AST nodes consist of this node on the top
  85    level with `obj' pointing to the actual content. */
  86 typedef struct {
  87   tre_ast_type_t type;   /* Type of the node. */
  88   void *obj;             /* Pointer to actual node. */
  89   int nullable;
  90   int submatch_id;
  91   int num_submatches;
  92   int num_tags;
  93   tre_pos_and_tags_t *firstpos;
  94   tre_pos_and_tags_t *lastpos;
  95 } tre_ast_node_t;
  96
  97
  98 /* A "literal" node.  These are created for assertions, back references,
  99    tags, matching parameter settings, and all expressions that match one
 100    character. */
 101 typedef struct {
 102   long code_min;
 103   long code_max;
 104   int position;
 105   tre_ctype_t class;
 106   tre_ctype_t *neg_classes;
 107 } tre_literal_t;
 108
 109 /* A "catenation" node.  These are created when two regexps are concatenated.
 110    If there are more than one subexpressions in sequence, the `left' part
 111    holds all but the last, and `right' part holds the last subexpression
 112    (catenation is left associative). */
 113 typedef struct {
 114   tre_ast_node_t *left;
 115   tre_ast_node_t *right;
 116 } tre_catenation_t;
 117
 118 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 119    operators. */
 120 typedef struct {
 121   /* Subexpression to match. */
 122   tre_ast_node_t *arg;
 123   /* Minimum number of consecutive matches. */
 124   int min;
 125   /* Maximum number of consecutive matches. */
 126   int max;
 127   /* If 0, match as many characters as possible, if 1 match as few as
 128      possible.  Note that this does not always mean the same thing as
 129      matching as many/few repetitions as possible. */
 130   unsigned int minimal:1;
 131 } tre_iteration_t;
 132
 133 /* An "union" node.  These are created for the "|" operator. */
 134 typedef struct {
 135   tre_ast_node_t *left;
 136   tre_ast_node_t *right;
 137 } tre_union_t;
 138
 139
 140 static tre_ast_node_t *
 141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 142 {
 143         tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
 144         if (!node || !obj)
 145                 return 0;
 146         node->obj = obj;
 147         node->type = type;
 148         node->nullable = -1;
 149         node->submatch_id = -1;
 150         return node;
 151 }
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 155 {
 156         tre_ast_node_t *node;
 157         tre_literal_t *lit;
 158
 159         lit = tre_mem_calloc(mem, sizeof *lit);
 160         node = tre_ast_new_node(mem, LITERAL, lit);
 161         if (!node)
 162                 return 0;
 163         lit->code_min = code_min;
 164         lit->code_max = code_max;
 165         lit->position = position;
 166         return node;
 167 }
 168
 169 static tre_ast_node_t *
 170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 171 {
 172         tre_ast_node_t *node;
 173         tre_iteration_t *iter;
 174
 175         iter = tre_mem_calloc(mem, sizeof *iter);
 176         node = tre_ast_new_node(mem, ITERATION, iter);
 177         if (!node)
 178                 return 0;
 179         iter->arg = arg;
 180         iter->min = min;
 181         iter->max = max;
 182         iter->minimal = minimal;
 183         node->num_submatches = arg->num_submatches;
 184         return node;
 185 }
 186
 187 static tre_ast_node_t *
 188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 189 {
 190         tre_ast_node_t *node;
 191         tre_union_t *un;
 192
 193         if (!left)
 194                 return right;
 195         un = tre_mem_calloc(mem, sizeof *un);
 196         node = tre_ast_new_node(mem, UNION, un);
 197         if (!node || !right)
 198                 return 0;
 199         un->left = left;
 200         un->right = right;
 201         node->num_submatches = left->num_submatches + right->num_submatches;
 202         return node;
 203 }
 204
 205 static tre_ast_node_t *
 206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 207 {
 208         tre_ast_node_t *node;
 209         tre_catenation_t *cat;
 210
 211         if (!left)
 212                 return right;
 213         cat = tre_mem_calloc(mem, sizeof *cat);
 214         node = tre_ast_new_node(mem, CATENATION, cat);
 215         if (!node)
 216                 return 0;
 217         cat->left = left;
 218         cat->right = right;
 219         node->num_submatches = left->num_submatches + right->num_submatches;
 220         return node;
 221 }
 222
 223
 224 /***********************************************************************
 225  from tre-stack.c and tre-stack.h
 226 ***********************************************************************/
 227
 228 typedef struct tre_stack_rec tre_stack_t;
 229
 230 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 231    is maximum size, and `increment' specifies how much more space will be
 232    allocated with realloc() if all space gets used up.  Returns the stack
 233    object or NULL if out of memory. */
 234 static tre_stack_t *
 235 tre_stack_new(int size, int max_size, int increment);
 236
 237 /* Frees the stack object. */
 238 static void
 239 tre_stack_destroy(tre_stack_t *s);
 240
 241 /* Returns the current number of objects in the stack. */
 242 static int
 243 tre_stack_num_objects(tre_stack_t *s);
 244
 245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 247    This tries to realloc() more space before failing if maximum size
 248    has not yet been reached.  Returns REG_OK if successful. */
 249 #define declare_pushf(typetag, type)                                          \
 250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 251
 252 declare_pushf(voidptr, void *);
 253 declare_pushf(int, int);
 254
 255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 256    element off of stack `s' and returns it.  The stack must not be
 257    empty. */
 258 #define declare_popf(typetag, type)               \
 259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 260
 261 declare_popf(voidptr, void *);
 262 declare_popf(int, int);
 263
 264 /* Just to save some typing. */
 265 #define STACK_PUSH(s, typetag, value)                                         \
 266   do                                                                          \
 267     {                                                                         \
 268       status = tre_stack_push_ ## typetag(s, value);                          \
 269     }                                                                         \
 270   while (/*CONSTCOND*/0)
 271
 272 #define STACK_PUSHX(s, typetag, value)                                        \
 273   {                                                                           \
 274     status = tre_stack_push_ ## typetag(s, value);                            \
 275     if (status != REG_OK)                                                     \
 276       break;                                                                  \
 277   }
 278
 279 #define STACK_PUSHR(s, typetag, value)                                        \
 280   {                                                                           \
 281     reg_errcode_t _status;                                                    \
 282     _status = tre_stack_push_ ## typetag(s, value);                           \
 283     if (_status != REG_OK)                                                    \
 284       return _status;                                                         \
 285   }
 286
 287 union tre_stack_item {
 288   void *voidptr_value;
 289   int int_value;
 290 };
 291
 292 struct tre_stack_rec {
 293   int size;
 294   int max_size;
 295   int increment;
 296   int ptr;
 297   union tre_stack_item *stack;
 298 };
 299
 300
 301 static tre_stack_t *
 302 tre_stack_new(int size, int max_size, int increment)
 303 {
 304   tre_stack_t *s;
 305
 306   s = xmalloc(sizeof(*s));
 307   if (s != NULL)
 308     {
 309       s->stack = xmalloc(sizeof(*s->stack) * size);
 310       if (s->stack == NULL)
 311         {
 312           xfree(s);
 313           return NULL;
 314         }
 315       s->size = size;
 316       s->max_size = max_size;
 317       s->increment = increment;
 318       s->ptr = 0;
 319     }
 320   return s;
 321 }
 322
 323 static void
 324 tre_stack_destroy(tre_stack_t *s)
 325 {
 326   xfree(s->stack);
 327   xfree(s);
 328 }
 329
 330 static int
 331 tre_stack_num_objects(tre_stack_t *s)
 332 {
 333   return s->ptr;
 334 }
 335
 336 static reg_errcode_t
 337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 338 {
 339   if (s->ptr < s->size)
 340     {
 341       s->stack[s->ptr] = value;
 342       s->ptr++;
 343     }
 344   else
 345     {
 346       if (s->size >= s->max_size)
 347         {
 348           return REG_ESPACE;
 349         }
 350       else
 351         {
 352           union tre_stack_item *new_buffer;
 353           int new_size;
 354           new_size = s->size + s->increment;
 355           if (new_size > s->max_size)
 356             new_size = s->max_size;
 357           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 358           if (new_buffer == NULL)
 359             {
 360               return REG_ESPACE;
 361             }
 362           assert(new_size > s->size);
 363           s->size = new_size;
 364           s->stack = new_buffer;
 365           tre_stack_push(s, value);
 366         }
 367     }
 368   return REG_OK;
 369 }
 370
 371 #define define_pushf(typetag, type)  \
 372   declare_pushf(typetag, type) {     \
 373     union tre_stack_item item;       \
 374     item.typetag ## _value = value;  \
 375     return tre_stack_push(s, item);  \
 376 }
 377
 378 define_pushf(int, int)
 379 define_pushf(voidptr, void *)
 380
 381 #define define_popf(typetag, type)                  \
 382   declare_popf(typetag, type) {                     \
 383     return s->stack[--s->ptr].typetag ## _value;    \
 384   }
 385
 386 define_popf(int, int)
 387 define_popf(voidptr, void *)
 388
 389
 390 /***********************************************************************
 391  from tre-parse.c and tre-parse.h
 392 ***********************************************************************/
 393
 394 /* Parse context. */
 395 typedef struct {
 396         /* Memory allocator. The AST is allocated using this. */
 397         tre_mem_t mem;
 398         /* Stack used for keeping track of regexp syntax. */
 399         tre_stack_t *stack;
 400         /* The parsed node after a parse function returns. */
 401         tre_ast_node_t *n;
 402         /* Position in the regexp pattern after a parse function returns. */
 403         const char *s;
 404         /* The first character of the regexp. */
 405         const char *re;
 406         /* Current submatch ID. */
 407         int submatch_id;
 408         /* Current position (number of literal). */
 409         int position;
 410         /* The highest back reference or -1 if none seen so far. */
 411         int max_backref;
 412         /* Compilation flags. */
 413         int cflags;
 414 } tre_parse_ctx_t;
 415
 416 /* Some macros for expanding \w, \s, etc. */
 417 static const struct {
 418         char c;
 419         const char *expansion;
 420 } tre_macros[] = {
 421         {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 422         {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 423         {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 424         {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
 425         { 0, 0 }
 426 };
 427
 428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 429    must have at least `len' items.  Sets buf[0] to zero if the there
 430    is no match in `tre_macros'. */
 431 static const char *tre_expand_macro(const char *s)
 432 {
 433         int i;
 434         for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
 435         return tre_macros[i].expansion;
 436 }
 437
 438 static int
 439 tre_compare_lit(const void *a, const void *b)
 440 {
 441         const tre_literal_t *const *la = a;
 442         const tre_literal_t *const *lb = b;
 443         /* assumes the range of valid code_min is < INT_MAX */
 444         return la[0]->code_min - lb[0]->code_min;
 445 }
 446
 447 struct literals {
 448         tre_mem_t mem;
 449         tre_literal_t **a;
 450         int len;
 451         int cap;
 452 };
 453
 454 static tre_literal_t *tre_new_lit(struct literals *p)
 455 {
 456         tre_literal_t **a;
 457         if (p->len >= p->cap) {
 458                 if (p->cap >= 1<<15)
 459                         return 0;
 460                 p->cap *= 2;
 461                 a = xrealloc(p->a, p->cap * sizeof *p->a);
 462                 if (!a)
 463                         return 0;
 464                 p->a = a;
 465         }
 466         a = p->a + p->len++;
 467         *a = tre_mem_calloc(p->mem, sizeof **a);
 468         return *a;
 469 }
 470
 471 static int add_icase_literals(struct literals *ls, int min, int max)
 472 {
 473         tre_literal_t *lit;
 474         int b, e, c;
 475         for (c=min; c<=max; ) {
 476                 /* assumes islower(c) and isupper(c) are exclusive
 477                    and toupper(c)!=c if islower(c).
 478                    multiple opposite case characters are not supported */
 479                 if (tre_islower(c)) {
 480                         b = e = tre_toupper(c);
 481                         for (c++, e++; c<=max; c++, e++)
 482                                 if (tre_toupper(c) != e) break;
 483                 } else if (tre_isupper(c)) {
 484                         b = e = tre_tolower(c);
 485                         for (c++, e++; c<=max; c++, e++)
 486                                 if (tre_tolower(c) != e) break;
 487                 } else {
 488                         c++;
 489                         continue;
 490                 }
 491                 lit = tre_new_lit(ls);
 492                 if (!lit)
 493                         return -1;
 494                 lit->code_min = b;
 495                 lit->code_max = e-1;
 496                 lit->position = -1;
 497         }
 498         return 0;
 499 }
 500
 501
 502 /* Maximum number of character classes in a negated bracket expression. */
 503 #define MAX_NEG_CLASSES 64
 504
 505 struct neg {
 506         int negate;
 507         int len;
 508         tre_ctype_t a[MAX_NEG_CLASSES];
 509 };
 510
 511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 512
 513 /*
 514 bracket grammar:
 515 Bracket  =  '[' List ']'  |  '[^' List ']'
 516 List     =  Term  |  List Term
 517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
 518 Range    =  Char '-' Char  |  Char '-' '-'
 519 Char     =  Coll  |  coll_single
 520 Meta     =  ']'  |  '-'
 521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
 522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
 523 Chclass  =  '[:' class ':]'
 524
 525 coll_single is a single char collating element but it can be
 526  '-' only at the beginning or end of a List and
 527  ']' only at the beginning of a List and
 528  '^' anywhere except after the openning '['
 529 */
 530
 531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
 532 {
 533         const char *start = s;
 534         tre_ctype_t class;
 535         int min, max;
 536         wchar_t wc;
 537         int len;
 538
 539         for (;;) {
 540                 class = 0;
 541                 len = mbtowc(&wc, s, -1);
 542                 if (len <= 0)
 543                         return *s ? REG_BADPAT : REG_EBRACK;
 544                 if (*s == ']' && s != start) {
 545                         ctx->s = s+1;
 546                         return REG_OK;
 547                 }
 548                 if (*s == '-' && s != start && s[1] != ']' &&
 549                     /* extension: [a-z--@] is accepted as [a-z]|[--@] */
 550                     (s[1] != '-' || s[2] == ']'))
 551                         return REG_ERANGE;
 552                 if (*s == '[' && (s[1] == '.' || s[1] == '='))
 553                         /* collating symbols and equivalence classes are not supported */
 554                         return REG_ECOLLATE;
 555                 if (*s == '[' && s[1] == ':') {
 556                         char tmp[CHARCLASS_NAME_MAX+1];
 557                         s += 2;
 558                         for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
 559                                 if (s[len] == ':') {
 560                                         memcpy(tmp, s, len);
 561                                         tmp[len] = 0;
 562                                         class = tre_ctype(tmp);
 563                                         break;
 564                                 }
 565                         }
 566                         if (!class || s[len+1] != ']')
 567                                 return REG_ECTYPE;
 568                         min = 0;
 569                         max = TRE_CHAR_MAX;
 570                         s += len+2;
 571                 } else {
 572                         min = max = wc;
 573                         s += len;
 574                         if (*s == '-' && s[1] != ']') {
 575                                 s++;
 576                                 len = mbtowc(&wc, s, -1);
 577                                 max = wc;
 578                                 /* XXX - Should use collation order instead of
 579                                    encoding values in character ranges. */
 580                                 if (len <= 0 || min > max)
 581                                         return REG_ERANGE;
 582                                 s += len;
 583                         }
 584                 }
 585
 586                 if (class && neg->negate) {
 587                         if (neg->len >= MAX_NEG_CLASSES)
 588                                 return REG_ESPACE;
 589                         neg->a[neg->len++] = class;
 590                 } else  {
 591                         tre_literal_t *lit = tre_new_lit(ls);
 592                         if (!lit)
 593                                 return REG_ESPACE;
 594                         lit->code_min = min;
 595                         lit->code_max = max;
 596                         lit->class = class;
 597                         lit->position = -1;
 598
 599                         /* Add opposite-case codepoints if REG_ICASE is present.
 600                            It seems that POSIX requires that bracket negation
 601                            should happen before case-folding, but most practical
 602                            implementations do it the other way around. Changing
 603                            the order would need efficient representation of
 604                            case-fold ranges and bracket range sets even with
 605                            simple patterns so this is ok for now. */
 606                         if (ctx->cflags & REG_ICASE && !class)
 607                                 if (add_icase_literals(ls, min, max))
 608                                         return REG_ESPACE;
 609                 }
 610         }
 611 }
 612
 613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 614 {
 615         int i, max, min, negmax, negmin;
 616         tre_ast_node_t *node = 0, *n;
 617         tre_ctype_t *nc = 0;
 618         tre_literal_t *lit;
 619         struct literals ls;
 620         struct neg neg;
 621         reg_errcode_t err;
 622
 623         ls.mem = ctx->mem;
 624         ls.len = 0;
 625         ls.cap = 32;
 626         ls.a = xmalloc(ls.cap * sizeof *ls.a);
 627         if (!ls.a)
 628                 return REG_ESPACE;
 629         neg.len = 0;
 630         neg.negate = *s == '^';
 631         if (neg.negate)
 632                 s++;
 633
 634         err = parse_bracket_terms(ctx, s, &ls, &neg);
 635         if (err != REG_OK)
 636                 goto parse_bracket_done;
 637
 638         if (neg.negate) {
 639                 /* Sort the array if we need to negate it. */
 640                 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
 641                 /* extra lit for the last negated range */
 642                 lit = tre_new_lit(&ls);
 643                 if (!lit) {
 644                         err = REG_ESPACE;
 645                         goto parse_bracket_done;
 646                 }
 647                 lit->code_min = TRE_CHAR_MAX+1;
 648                 lit->code_max = TRE_CHAR_MAX+1;
 649                 lit->position = -1;
 650                 /* negated classes */
 651                 if (neg.len) {
 652                         nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
 653                         if (!nc) {
 654                                 err = REG_ESPACE;
 655                                 goto parse_bracket_done;
 656                         }
 657                         memcpy(nc, neg.a, neg.len*sizeof *neg.a);
 658                         nc[neg.len] = 0;
 659                 }
 660         }
 661
 662         /* Build a union of the items in the array, negated if necessary. */
 663         negmax = negmin = 0;
 664         for (i = 0; i < ls.len; i++) {
 665                 lit = ls.a[i];
 666                 min = lit->code_min;
 667                 max = lit->code_max;
 668                 if (neg.negate) {
 669                         if (min <= negmin) {
 670                                 /* Overlap. */
 671                                 negmin = MAX(max + 1, negmin);
 672                                 continue;
 673                         }
 674                         negmax = min - 1;
 675                         lit->code_min = negmin;
 676                         lit->code_max = negmax;
 677                         negmin = max + 1;
 678                 }
 679                 lit->position = ctx->position;
 680                 lit->neg_classes = nc;
 681                 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
 682                 node = tre_ast_new_union(ctx->mem, node, n);
 683                 if (!node) {
 684                         err = REG_ESPACE;
 685                         break;
 686                 }
 687         }
 688
 689 parse_bracket_done:
 690         xfree(ls.a);
 691         ctx->position++;
 692         ctx->n = node;
 693         return err;
 694 }
 695
 696 static const char *parse_dup_count(const char *s, int *n)
 697 {
 698         *n = -1;
 699         if (!isdigit(*s))
 700                 return s;
 701         *n = 0;
 702         for (;;) {
 703                 *n = 10 * *n + (*s - '0');
 704                 s++;
 705                 if (!isdigit(*s) || *n > RE_DUP_MAX)
 706                         break;
 707         }
 708         return s;
 709 }
 710
 711 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
 712 {
 713         int min, max;
 714
 715         s = parse_dup_count(s, &min);
 716         if (*s == ',')
 717                 s = parse_dup_count(s+1, &max);
 718         else
 719                 max = min;
 720
 721         if (
 722                 (max < min && max >= 0) ||
 723                 max > RE_DUP_MAX ||
 724                 min > RE_DUP_MAX ||
 725                 min < 0 ||
 726                 (!ere && *s++ != '\\') ||
 727                 *s++ != '}'
 728         )
 729                 return 0;
 730         *pmin = min;
 731         *pmax = max;
 732         return s;
 733 }
 734
 735 static int hexval(unsigned c)
 736 {
 737         if (c-'0'<10) return c-'0';
 738         c |= 32;
 739         if (c-'a'<6) return c-'a'+10;
 740         return -1;
 741 }
 742
 743 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
 744 {
 745         if (node->submatch_id >= 0) {
 746                 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 747                 if (!n)
 748                         return REG_ESPACE;
 749                 n = tre_ast_new_catenation(ctx->mem, n, node);
 750                 if (!n)
 751                         return REG_ESPACE;
 752                 n->num_submatches = node->num_submatches;
 753                 node = n;
 754         }
 755         node->submatch_id = subid;
 756         node->num_submatches++;
 757         ctx->n = node;
 758         return REG_OK;
 759 }
 760
 761 /*
 762 BRE grammar:
 763 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
 764 Branch =  Atom  |  Branch Atom
 765 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
 766 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 767
 768 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 769
 770 ERE grammar:
 771 Regex  =  Branch  |  Regex '|' Branch
 772 Branch =  Atom  |  Branch Atom
 773 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
 774 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 775
 776 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 777 */
 778
 779 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
 780 {
 781         int len, ere = ctx->cflags & REG_EXTENDED;
 782         const char *p;
 783         tre_ast_node_t *node;
 784         wchar_t wc;
 785         switch (*s) {
 786         case '[':
 787                 return parse_bracket(ctx, s+1);
 788         case '\\':
 789                 p = tre_expand_macro(s+1);
 790                 if (p) {
 791                         /* assume \X expansion is a single atom */
 792                         reg_errcode_t err = parse_atom(ctx, p);
 793                         ctx->s = s+2;
 794                         return err;
 795                 }
 796                 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
 797                 switch (*++s) {
 798                 case 0:
 799                         return REG_EESCAPE;
 800                 case 'b':
 801                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
 802                         break;
 803                 case 'B':
 804                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
 805                         break;
 806                 case '<':
 807                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
 808                         break;
 809                 case '>':
 810                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
 811                         break;
 812                 case 'x':
 813                         s++;
 814                         int i, v = 0, c;
 815                         len = 2;
 816                         if (*s == '{') {
 817                                 len = 8;
 818                                 s++;
 819                         }
 820                         for (i=0; i<len && v<0x110000; i++) {
 821                                 c = hexval(s[i]);
 822                                 if (c < 0) break;
 823                                 v = 16*v + c;
 824                         }
 825                         s += i;
 826                         if (len == 8) {
 827                                 if (*s != '}')
 828                                         return REG_EBRACE;
 829                                 s++;
 830                         }
 831                         node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
 832                         s--;
 833                         break;
 834                 case '{':
 835                 case '+':
 836                 case '?':
 837                         /* extension: treat \+, \? as repetitions in BRE */
 838                         /* reject repetitions after empty expression in BRE */
 839                         if (!ere)
 840                                 return REG_BADRPT;
 841                 case '|':
 842                         /* extension: treat \| as alternation in BRE */
 843                         if (!ere) {
 844                                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 845                                 s--;
 846                                 goto end;
 847                         }
 848                         /* fallthrough */
 849                 default:
 850                         if (!ere && (unsigned)*s-'1' < 9) {
 851                                 /* back reference */
 852                                 int val = *s - '0';
 853                                 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
 854                                 ctx->max_backref = MAX(val, ctx->max_backref);
 855                         } else {
 856                                 /* extension: accept unknown escaped char
 857                                    as a literal */
 858                                 goto parse_literal;
 859                         }
 860                 }
 861                 s++;
 862                 break;
 863         case '.':
 864                 if (ctx->cflags & REG_NEWLINE) {
 865                         tre_ast_node_t *tmp1, *tmp2;
 866                         tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
 867                         tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
 868                         if (tmp1 && tmp2)
 869                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 870                         else
 871                                 node = 0;
 872                 } else {
 873                         node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 874                 }
 875                 s++;
 876                 break;
 877         case '^':
 878                 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
 879                 if (!ere && s != ctx->re)
 880                         goto parse_literal;
 881                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
 882                 s++;
 883                 break;
 884         case '$':
 885                 /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
 886                 if (!ere && s[1])
 887                         goto parse_literal;
 888                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
 889                 s++;
 890                 break;
 891         case '*':
 892         case '{':
 893         case '+':
 894         case '?':
 895                 /* reject repetitions after empty expression in ERE */
 896                 if (ere)
 897                         return REG_BADRPT;
 898         case '|':
 899                 if (!ere)
 900                         goto parse_literal;
 901         case 0:
 902                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 903                 break;
 904         default:
 905 parse_literal:
 906                 len = mbtowc(&wc, s, -1);
 907                 if (len < 0)
 908                         return REG_BADPAT;
 909                 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
 910                         tre_ast_node_t *tmp1, *tmp2;
 911                         /* multiple opposite case characters are not supported */
 912                         tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
 913                         tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
 914                         if (tmp1 && tmp2)
 915                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 916                         else
 917                                 node = 0;
 918                 } else {
 919                         node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 920                 }
 921                 ctx->position++;
 922                 s += len;
 923                 break;
 924         }
 925 end:
 926         if (!node)
 927                 return REG_ESPACE;
 928         ctx->n = node;
 929         ctx->s = s;
 930         return REG_OK;
 931 }
 932
 933 #define PUSHPTR(err, s, v) do { \
 934         if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
 935                 return err; \
 936 } while(0)
 937
 938 #define PUSHINT(err, s, v) do { \
 939         if ((err = tre_stack_push_int(s, v)) != REG_OK) \
 940                 return err; \
 941 } while(0)
 942
 943 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
 944 {
 945         tre_ast_node_t *nbranch=0, *nunion=0;
 946         int ere = ctx->cflags & REG_EXTENDED;
 947         const char *s = ctx->re;
 948         int subid = 0;
 949         int depth = 0;
 950         reg_errcode_t err;
 951         tre_stack_t *stack = ctx->stack;
 952
 953         PUSHINT(err, stack, subid++);
 954         for (;;) {
 955                 if ((!ere && *s == '\\' && s[1] == '(') ||
 956                     (ere && *s == '(')) {
 957                         PUSHPTR(err, stack, nunion);
 958                         PUSHPTR(err, stack, nbranch);
 959                         PUSHINT(err, stack, subid++);
 960                         s++;
 961                         if (!ere)
 962                                 s++;
 963                         depth++;
 964                         nbranch = nunion = 0;
 965                         continue;
 966                 }
 967                 if ((!ere && *s == '\\' && s[1] == ')') ||
 968                     (ere && *s == ')' && depth)) {
 969                         ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 970                         if (!ctx->n)
 971                                 return REG_ESPACE;
 972                 } else {
 973                         err = parse_atom(ctx, s);
 974                         if (err != REG_OK)
 975                                 return err;
 976                         s = ctx->s;
 977                 }
 978
 979         parse_iter:
 980                 for (;;) {
 981                         int min, max;
 982
 983                         if (*s!='\\' && *s!='*') {
 984                                 if (!ere)
 985                                         break;
 986                                 if (*s!='+' && *s!='?' && *s!='{')
 987                                         break;
 988                         }
 989                         if (*s=='\\' && ere)
 990                                 break;
 991                         /* extension: treat \+, \? as repetitions in BRE */
 992                         if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
 993                                 break;
 994                         if (*s=='\\')
 995                                 s++;
 996
 997                         /* extension: multiple consecutive *+?{,} is unspecified,
 998                            but (a+)+ has to be supported so accepting a++ makes
 999                            sense, note however that the RE_DUP_MAX limit can be
1000                            circumvented: (a{255}){255} uses a lot of memory.. */
1001                         if (*s=='{') {
1002                                 s = parse_dup(s+1, ere, &min, &max);
1003                                 if (!s)
1004                                         return REG_BADBR;
1005                         } else {
1006                                 min=0;
1007                                 max=-1;
1008                                 if (*s == '+')
1009                                         min = 1;
1010                                 if (*s == '?')
1011                                         max = 1;
1012                                 s++;
1013                         }
1014                         if (max == 0)
1015                                 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1016                         else
1017                                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1018                         if (!ctx->n)
1019                                 return REG_ESPACE;
1020                 }
1021
1022                 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1023                 if ((ere && *s == '|') ||
1024                     (ere && *s == ')' && depth) ||
1025                     (!ere && *s == '\\' && s[1] == ')') ||
1026                     /* extension: treat \| as alternation in BRE */
1027                     (!ere && *s == '\\' && s[1] == '|') ||
1028                     !*s) {
1029                         /* extension: empty branch is unspecified (), (|a), (a|)
1030                            here they are not rejected but match on empty string */
1031                         int c = *s;
1032                         nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1033                         nbranch = 0;
1034
1035                         if (c == '\\' && s[1] == '|') {
1036                                 s+=2;
1037                         } else if (c == '|') {
1038                                 s++;
1039                         } else {
1040                                 if (c == '\\') {
1041                                         if (!depth) return REG_EPAREN;
1042                                         s+=2;
1043                                 } else if (c == ')')
1044                                         s++;
1045                                 depth--;
1046                                 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1047                                 if (err != REG_OK)
1048                                         return err;
1049                                 if (!c && depth<0) {
1050                                         ctx->submatch_id = subid;
1051                                         return REG_OK;
1052                                 }
1053                                 if (!c || depth<0)
1054                                         return REG_EPAREN;
1055                                 nbranch = tre_stack_pop_voidptr(stack);
1056                                 nunion = tre_stack_pop_voidptr(stack);
1057                                 goto parse_iter;
1058                         }
1059                 }
1060         }
1061 }
1062
1063
1064 /***********************************************************************
1065  from tre-compile.c
1066 ***********************************************************************/
1067
1068
1069 /*
1070   TODO:
1071    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1072      function calls.
1073 */
1074
1075 /*
1076   Algorithms to setup tags so that submatch addressing can be done.
1077 */
1078
1079
1080 /* Inserts a catenation node to the root of the tree given in `node'.
1081    As the left child a new tag with number `tag_id' to `node' is added,
1082    and the right child is the old root. */
1083 static reg_errcode_t
1084 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1085 {
1086   tre_catenation_t *c;
1087
1088   c = tre_mem_alloc(mem, sizeof(*c));
1089   if (c == NULL)
1090     return REG_ESPACE;
1091   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1092   if (c->left == NULL)
1093     return REG_ESPACE;
1094   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1095   if (c->right == NULL)
1096     return REG_ESPACE;
1097
1098   c->right->obj = node->obj;
1099   c->right->type = node->type;
1100   c->right->nullable = -1;
1101   c->right->submatch_id = -1;
1102   c->right->firstpos = NULL;
1103   c->right->lastpos = NULL;
1104   c->right->num_tags = 0;
1105   node->obj = c;
1106   node->type = CATENATION;
1107   return REG_OK;
1108 }
1109
1110 /* Inserts a catenation node to the root of the tree given in `node'.
1111    As the right child a new tag with number `tag_id' to `node' is added,
1112    and the left child is the old root. */
1113 static reg_errcode_t
1114 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1115 {
1116   tre_catenation_t *c;
1117
1118   c = tre_mem_alloc(mem, sizeof(*c));
1119   if (c == NULL)
1120     return REG_ESPACE;
1121   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1122   if (c->right == NULL)
1123     return REG_ESPACE;
1124   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1125   if (c->left == NULL)
1126     return REG_ESPACE;
1127
1128   c->left->obj = node->obj;
1129   c->left->type = node->type;
1130   c->left->nullable = -1;
1131   c->left->submatch_id = -1;
1132   c->left->firstpos = NULL;
1133   c->left->lastpos = NULL;
1134   c->left->num_tags = 0;
1135   node->obj = c;
1136   node->type = CATENATION;
1137   return REG_OK;
1138 }
1139
1140 typedef enum {
1141   ADDTAGS_RECURSE,
1142   ADDTAGS_AFTER_ITERATION,
1143   ADDTAGS_AFTER_UNION_LEFT,
1144   ADDTAGS_AFTER_UNION_RIGHT,
1145   ADDTAGS_AFTER_CAT_LEFT,
1146   ADDTAGS_AFTER_CAT_RIGHT,
1147   ADDTAGS_SET_SUBMATCH_END
1148 } tre_addtags_symbol_t;
1149
1150
1151 typedef struct {
1152   int tag;
1153   int next_tag;
1154 } tre_tag_states_t;
1155
1156
1157 /* Go through `regset' and set submatch data for submatches that are
1158    using this tag. */
1159 static void
1160 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1161 {
1162   int i;
1163
1164   for (i = 0; regset[i] >= 0; i++)
1165     {
1166       int id = regset[i] / 2;
1167       int start = !(regset[i] % 2);
1168       if (start)
1169         tnfa->submatch_data[id].so_tag = tag;
1170       else
1171         tnfa->submatch_data[id].eo_tag = tag;
1172     }
1173   regset[0] = -1;
1174 }
1175
1176
1177 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1178    subexpressions marked for submatch addressing can be traced. */
1179 static reg_errcode_t
1180 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1181              tre_tnfa_t *tnfa)
1182 {
1183   reg_errcode_t status = REG_OK;
1184   tre_addtags_symbol_t symbol;
1185   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1186   int bottom = tre_stack_num_objects(stack);
1187   /* True for first pass (counting number of needed tags) */
1188   int first_pass = (mem == NULL || tnfa == NULL);
1189   int *regset, *orig_regset;
1190   int num_tags = 0; /* Total number of tags. */
1191   int num_minimals = 0;  /* Number of special minimal tags. */
1192   int tag = 0;      /* The tag that is to be added next. */
1193   int next_tag = 1; /* Next tag to use after this one. */
1194   int *parents;     /* Stack of submatches the current submatch is
1195                        contained in. */
1196   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1197   tre_tag_states_t *saved_states;
1198
1199   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1200   if (!first_pass)
1201     {
1202       tnfa->end_tag = 0;
1203       tnfa->minimal_tags[0] = -1;
1204     }
1205
1206   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1207   if (regset == NULL)
1208     return REG_ESPACE;
1209   regset[0] = -1;
1210   orig_regset = regset;
1211
1212   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1213   if (parents == NULL)
1214     {
1215       xfree(regset);
1216       return REG_ESPACE;
1217     }
1218   parents[0] = -1;
1219
1220   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1221   if (saved_states == NULL)
1222     {
1223       xfree(regset);
1224       xfree(parents);
1225       return REG_ESPACE;
1226     }
1227   else
1228     {
1229       unsigned int i;
1230       for (i = 0; i <= tnfa->num_submatches; i++)
1231         saved_states[i].tag = -1;
1232     }
1233
1234   STACK_PUSH(stack, voidptr, node);
1235   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1236
1237   while (tre_stack_num_objects(stack) > bottom)
1238     {
1239       if (status != REG_OK)
1240         break;
1241
1242       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1243       switch (symbol)
1244         {
1245
1246         case ADDTAGS_SET_SUBMATCH_END:
1247           {
1248             int id = tre_stack_pop_int(stack);
1249             int i;
1250
1251             /* Add end of this submatch to regset. */
1252             for (i = 0; regset[i] >= 0; i++);
1253             regset[i] = id * 2 + 1;
1254             regset[i + 1] = -1;
1255
1256             /* Pop this submatch from the parents stack. */
1257             for (i = 0; parents[i] >= 0; i++);
1258             parents[i - 1] = -1;
1259             break;
1260           }
1261
1262         case ADDTAGS_RECURSE:
1263           node = tre_stack_pop_voidptr(stack);
1264
1265           if (node->submatch_id >= 0)
1266             {
1267               int id = node->submatch_id;
1268               int i;
1269
1270
1271               /* Add start of this submatch to regset. */
1272               for (i = 0; regset[i] >= 0; i++);
1273               regset[i] = id * 2;
1274               regset[i + 1] = -1;
1275
1276               if (!first_pass)
1277                 {
1278                   for (i = 0; parents[i] >= 0; i++);
1279                   tnfa->submatch_data[id].parents = NULL;
1280                   if (i > 0)
1281                     {
1282                       int *p = xmalloc(sizeof(*p) * (i + 1));
1283                       if (p == NULL)
1284                         {
1285                           status = REG_ESPACE;
1286                           break;
1287                         }
1288                       assert(tnfa->submatch_data[id].parents == NULL);
1289                       tnfa->submatch_data[id].parents = p;
1290                       for (i = 0; parents[i] >= 0; i++)
1291                         p[i] = parents[i];
1292                       p[i] = -1;
1293                     }
1294                 }
1295
1296               /* Add end of this submatch to regset after processing this
1297                  node. */
1298               STACK_PUSHX(stack, int, node->submatch_id);
1299               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1300             }
1301
1302           switch (node->type)
1303             {
1304             case LITERAL:
1305               {
1306                 tre_literal_t *lit = node->obj;
1307
1308                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1309                   {
1310                     int i;
1311                     if (regset[0] >= 0)
1312                       {
1313                         /* Regset is not empty, so add a tag before the
1314                            literal or backref. */
1315                         if (!first_pass)
1316                           {
1317                             status = tre_add_tag_left(mem, node, tag);
1318                             tnfa->tag_directions[tag] = direction;
1319                             if (minimal_tag >= 0)
1320                               {
1321                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1322                                 tnfa->minimal_tags[i] = tag;
1323                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1324                                 tnfa->minimal_tags[i + 2] = -1;
1325                                 minimal_tag = -1;
1326                                 num_minimals++;
1327                               }
1328                             tre_purge_regset(regset, tnfa, tag);
1329                           }
1330                         else
1331                           {
1332                             node->num_tags = 1;
1333                           }
1334
1335                         regset[0] = -1;
1336                         tag = next_tag;
1337                         num_tags++;
1338                         next_tag++;
1339                       }
1340                   }
1341                 else
1342                   {
1343                     assert(!IS_TAG(lit));
1344                   }
1345                 break;
1346               }
1347             case CATENATION:
1348               {
1349                 tre_catenation_t *cat = node->obj;
1350                 tre_ast_node_t *left = cat->left;
1351                 tre_ast_node_t *right = cat->right;
1352                 int reserved_tag = -1;
1353
1354
1355                 /* After processing right child. */
1356                 STACK_PUSHX(stack, voidptr, node);
1357                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1358
1359                 /* Process right child. */
1360                 STACK_PUSHX(stack, voidptr, right);
1361                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1362
1363                 /* After processing left child. */
1364                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1365                 if (left->num_tags > 0 && right->num_tags > 0)
1366                   {
1367                     /* Reserve the next tag to the right child. */
1368                     reserved_tag = next_tag;
1369                     next_tag++;
1370                   }
1371                 STACK_PUSHX(stack, int, reserved_tag);
1372                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1373
1374                 /* Process left child. */
1375                 STACK_PUSHX(stack, voidptr, left);
1376                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1377
1378                 }
1379               break;
1380             case ITERATION:
1381               {
1382                 tre_iteration_t *iter = node->obj;
1383
1384                 if (first_pass)
1385                   {
1386                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1387                   }
1388                 else
1389                   {
1390                     STACK_PUSHX(stack, int, tag);
1391                     STACK_PUSHX(stack, int, iter->minimal);
1392                   }
1393                 STACK_PUSHX(stack, voidptr, node);
1394                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1395
1396                 STACK_PUSHX(stack, voidptr, iter->arg);
1397                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1398
1399                 /* Regset is not empty, so add a tag here. */
1400                 if (regset[0] >= 0 || iter->minimal)
1401                   {
1402                     if (!first_pass)
1403                       {
1404                         int i;
1405                         status = tre_add_tag_left(mem, node, tag);
1406                         if (iter->minimal)
1407                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1408                         else
1409                           tnfa->tag_directions[tag] = direction;
1410                         if (minimal_tag >= 0)
1411                           {
1412                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1413                             tnfa->minimal_tags[i] = tag;
1414                             tnfa->minimal_tags[i + 1] = minimal_tag;
1415                             tnfa->minimal_tags[i + 2] = -1;
1416                             minimal_tag = -1;
1417                             num_minimals++;
1418                           }
1419                         tre_purge_regset(regset, tnfa, tag);
1420                       }
1421
1422                     regset[0] = -1;
1423                     tag = next_tag;
1424                     num_tags++;
1425                     next_tag++;
1426                   }
1427                 direction = TRE_TAG_MINIMIZE;
1428               }
1429               break;
1430             case UNION:
1431               {
1432                 tre_union_t *uni = node->obj;
1433                 tre_ast_node_t *left = uni->left;
1434                 tre_ast_node_t *right = uni->right;
1435                 int left_tag;
1436                 int right_tag;
1437
1438                 if (regset[0] >= 0)
1439                   {
1440                     left_tag = next_tag;
1441                     right_tag = next_tag + 1;
1442                   }
1443                 else
1444                   {
1445                     left_tag = tag;
1446                     right_tag = next_tag;
1447                   }
1448
1449                 /* After processing right child. */
1450                 STACK_PUSHX(stack, int, right_tag);
1451                 STACK_PUSHX(stack, int, left_tag);
1452                 STACK_PUSHX(stack, voidptr, regset);
1453                 STACK_PUSHX(stack, int, regset[0] >= 0);
1454                 STACK_PUSHX(stack, voidptr, node);
1455                 STACK_PUSHX(stack, voidptr, right);
1456                 STACK_PUSHX(stack, voidptr, left);
1457                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1458
1459                 /* Process right child. */
1460                 STACK_PUSHX(stack, voidptr, right);
1461                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1462
1463                 /* After processing left child. */
1464                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1465
1466                 /* Process left child. */
1467                 STACK_PUSHX(stack, voidptr, left);
1468                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1469
1470                 /* Regset is not empty, so add a tag here. */
1471                 if (regset[0] >= 0)
1472                   {
1473                     if (!first_pass)
1474                       {
1475                         int i;
1476                         status = tre_add_tag_left(mem, node, tag);
1477                         tnfa->tag_directions[tag] = direction;
1478                         if (minimal_tag >= 0)
1479                           {
1480                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1481                             tnfa->minimal_tags[i] = tag;
1482                             tnfa->minimal_tags[i + 1] = minimal_tag;
1483                             tnfa->minimal_tags[i + 2] = -1;
1484                             minimal_tag = -1;
1485                             num_minimals++;
1486                           }
1487                         tre_purge_regset(regset, tnfa, tag);
1488                       }
1489
1490                     regset[0] = -1;
1491                     tag = next_tag;
1492                     num_tags++;
1493                     next_tag++;
1494                   }
1495
1496                 if (node->num_submatches > 0)
1497                   {
1498                     /* The next two tags are reserved for markers. */
1499                     next_tag++;
1500                     tag = next_tag;
1501                     next_tag++;
1502                   }
1503
1504                 break;
1505               }
1506             }
1507
1508           if (node->submatch_id >= 0)
1509             {
1510               int i;
1511               /* Push this submatch on the parents stack. */
1512               for (i = 0; parents[i] >= 0; i++);
1513               parents[i] = node->submatch_id;
1514               parents[i + 1] = -1;
1515             }
1516
1517           break; /* end case: ADDTAGS_RECURSE */
1518
1519         case ADDTAGS_AFTER_ITERATION:
1520           {
1521             int minimal = 0;
1522             int enter_tag;
1523             node = tre_stack_pop_voidptr(stack);
1524             if (first_pass)
1525               {
1526                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1527                   + tre_stack_pop_int(stack);
1528                 minimal_tag = -1;
1529               }
1530             else
1531               {
1532                 minimal = tre_stack_pop_int(stack);
1533                 enter_tag = tre_stack_pop_int(stack);
1534                 if (minimal)
1535                   minimal_tag = enter_tag;
1536               }
1537
1538             if (!first_pass)
1539               {
1540                 if (minimal)
1541                   direction = TRE_TAG_MINIMIZE;
1542                 else
1543                   direction = TRE_TAG_MAXIMIZE;
1544               }
1545             break;
1546           }
1547
1548         case ADDTAGS_AFTER_CAT_LEFT:
1549           {
1550             int new_tag = tre_stack_pop_int(stack);
1551             next_tag = tre_stack_pop_int(stack);
1552             if (new_tag >= 0)
1553               {
1554                 tag = new_tag;
1555               }
1556             break;
1557           }
1558
1559         case ADDTAGS_AFTER_CAT_RIGHT:
1560           node = tre_stack_pop_voidptr(stack);
1561           if (first_pass)
1562             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1563               + ((tre_catenation_t *)node->obj)->right->num_tags;
1564           break;
1565
1566         case ADDTAGS_AFTER_UNION_LEFT:
1567           /* Lift the bottom of the `regset' array so that when processing
1568              the right operand the items currently in the array are
1569              invisible.  The original bottom was saved at ADDTAGS_UNION and
1570              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1571           while (*regset >= 0)
1572             regset++;
1573           break;
1574
1575         case ADDTAGS_AFTER_UNION_RIGHT:
1576           {
1577             int added_tags, tag_left, tag_right;
1578             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1579             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1580             node = tre_stack_pop_voidptr(stack);
1581             added_tags = tre_stack_pop_int(stack);
1582             if (first_pass)
1583               {
1584                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1585                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1586                   + ((node->num_submatches > 0) ? 2 : 0);
1587               }
1588             regset = tre_stack_pop_voidptr(stack);
1589             tag_left = tre_stack_pop_int(stack);
1590             tag_right = tre_stack_pop_int(stack);
1591
1592             /* Add tags after both children, the left child gets a smaller
1593                tag than the right child.  This guarantees that we prefer
1594                the left child over the right child. */
1595             /* XXX - This is not always necessary (if the children have
1596                tags which must be seen for every match of that child). */
1597             /* XXX - Check if this is the only place where tre_add_tag_right
1598                is used.  If so, use tre_add_tag_left (putting the tag before
1599                the child as opposed after the child) and throw away
1600                tre_add_tag_right. */
1601             if (node->num_submatches > 0)
1602               {
1603                 if (!first_pass)
1604                   {
1605                     status = tre_add_tag_right(mem, left, tag_left);
1606                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1607                     if (status == REG_OK)
1608                       status = tre_add_tag_right(mem, right, tag_right);
1609                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1610                   }
1611                 num_tags += 2;
1612               }
1613             direction = TRE_TAG_MAXIMIZE;
1614             break;
1615           }
1616
1617         default:
1618           assert(0);
1619           break;
1620
1621         } /* end switch(symbol) */
1622     } /* end while(tre_stack_num_objects(stack) > bottom) */
1623
1624   if (!first_pass)
1625     tre_purge_regset(regset, tnfa, tag);
1626
1627   if (!first_pass && minimal_tag >= 0)
1628     {
1629       int i;
1630       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1631       tnfa->minimal_tags[i] = tag;
1632       tnfa->minimal_tags[i + 1] = minimal_tag;
1633       tnfa->minimal_tags[i + 2] = -1;
1634       minimal_tag = -1;
1635       num_minimals++;
1636     }
1637
1638   assert(tree->num_tags == num_tags);
1639   tnfa->end_tag = num_tags;
1640   tnfa->num_tags = num_tags;
1641   tnfa->num_minimals = num_minimals;
1642   xfree(orig_regset);
1643   xfree(parents);
1644   xfree(saved_states);
1645   return status;
1646 }
1647
1648
1649
1650 /*
1651   AST to TNFA compilation routines.
1652 */
1653
1654 typedef enum {
1655   COPY_RECURSE,
1656   COPY_SET_RESULT_PTR
1657 } tre_copyast_symbol_t;
1658
1659 /* Flags for tre_copy_ast(). */
1660 #define COPY_REMOVE_TAGS         1
1661 #define COPY_MAXIMIZE_FIRST_TAG  2
1662
1663 static reg_errcode_t
1664 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1665              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1666              tre_ast_node_t **copy, int *max_pos)
1667 {
1668   reg_errcode_t status = REG_OK;
1669   int bottom = tre_stack_num_objects(stack);
1670   int num_copied = 0;
1671   int first_tag = 1;
1672   tre_ast_node_t **result = copy;
1673   tre_copyast_symbol_t symbol;
1674
1675   STACK_PUSH(stack, voidptr, ast);
1676   STACK_PUSH(stack, int, COPY_RECURSE);
1677
1678   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1679     {
1680       tre_ast_node_t *node;
1681       if (status != REG_OK)
1682         break;
1683
1684       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1685       switch (symbol)
1686         {
1687         case COPY_SET_RESULT_PTR:
1688           result = tre_stack_pop_voidptr(stack);
1689           break;
1690         case COPY_RECURSE:
1691           node = tre_stack_pop_voidptr(stack);
1692           switch (node->type)
1693             {
1694             case LITERAL:
1695               {
1696                 tre_literal_t *lit = node->obj;
1697                 int pos = lit->position;
1698                 int min = lit->code_min;
1699                 int max = lit->code_max;
1700                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1701                   {
1702                     /* XXX - e.g. [ab] has only one position but two
1703                        nodes, so we are creating holes in the state space
1704                        here.  Not fatal, just wastes memory. */
1705                     pos += *pos_add;
1706                     num_copied++;
1707                   }
1708                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1709                   {
1710                     /* Change this tag to empty. */
1711                     min = EMPTY;
1712                     max = pos = -1;
1713                   }
1714                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1715                          && first_tag)
1716                   {
1717                     /* Maximize the first tag. */
1718                     tag_directions[max] = TRE_TAG_MAXIMIZE;
1719                     first_tag = 0;
1720                   }
1721                 *result = tre_ast_new_literal(mem, min, max, pos);
1722                 if (*result == NULL)
1723                   status = REG_ESPACE;
1724                 else {
1725                   tre_literal_t *p = (*result)->obj;
1726                   p->class = lit->class;
1727                   p->neg_classes = lit->neg_classes;
1728                 }
1729
1730                 if (pos > *max_pos)
1731                   *max_pos = pos;
1732                 break;
1733               }
1734             case UNION:
1735               {
1736                 tre_union_t *uni = node->obj;
1737                 tre_union_t *tmp;
1738                 *result = tre_ast_new_union(mem, uni->left, uni->right);
1739                 if (*result == NULL)
1740                   {
1741                     status = REG_ESPACE;
1742                     break;
1743                   }
1744                 tmp = (*result)->obj;
1745                 result = &tmp->left;
1746                 STACK_PUSHX(stack, voidptr, uni->right);
1747                 STACK_PUSHX(stack, int, COPY_RECURSE);
1748                 STACK_PUSHX(stack, voidptr, &tmp->right);
1749                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1750                 STACK_PUSHX(stack, voidptr, uni->left);
1751                 STACK_PUSHX(stack, int, COPY_RECURSE);
1752                 break;
1753               }
1754             case CATENATION:
1755               {
1756                 tre_catenation_t *cat = node->obj;
1757                 tre_catenation_t *tmp;
1758                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1759                 if (*result == NULL)
1760                   {
1761                     status = REG_ESPACE;
1762                     break;
1763                   }
1764                 tmp = (*result)->obj;
1765                 tmp->left = NULL;
1766                 tmp->right = NULL;
1767                 result = &tmp->left;
1768
1769                 STACK_PUSHX(stack, voidptr, cat->right);
1770                 STACK_PUSHX(stack, int, COPY_RECURSE);
1771                 STACK_PUSHX(stack, voidptr, &tmp->right);
1772                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1773                 STACK_PUSHX(stack, voidptr, cat->left);
1774                 STACK_PUSHX(stack, int, COPY_RECURSE);
1775                 break;
1776               }
1777             case ITERATION:
1778               {
1779                 tre_iteration_t *iter = node->obj;
1780                 STACK_PUSHX(stack, voidptr, iter->arg);
1781                 STACK_PUSHX(stack, int, COPY_RECURSE);
1782                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1783                                            iter->max, iter->minimal);
1784                 if (*result == NULL)
1785                   {
1786                     status = REG_ESPACE;
1787                     break;
1788                   }
1789                 iter = (*result)->obj;
1790                 result = &iter->arg;
1791                 break;
1792               }
1793             default:
1794               assert(0);
1795               break;
1796             }
1797           break;
1798         }
1799     }
1800   *pos_add += num_copied;
1801   return status;
1802 }
1803
1804 typedef enum {
1805   EXPAND_RECURSE,
1806   EXPAND_AFTER_ITER
1807 } tre_expand_ast_symbol_t;
1808
1809 /* Expands each iteration node that has a finite nonzero minimum or maximum
1810    iteration count to a catenated sequence of copies of the node. */
1811 static reg_errcode_t
1812 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1813                int *position, tre_tag_direction_t *tag_directions)
1814 {
1815   reg_errcode_t status = REG_OK;
1816   int bottom = tre_stack_num_objects(stack);
1817   int pos_add = 0;
1818   int pos_add_total = 0;
1819   int max_pos = 0;
1820   int iter_depth = 0;
1821
1822   STACK_PUSHR(stack, voidptr, ast);
1823   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1824   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1825     {
1826       tre_ast_node_t *node;
1827       tre_expand_ast_symbol_t symbol;
1828
1829       if (status != REG_OK)
1830         break;
1831
1832       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1833       node = tre_stack_pop_voidptr(stack);
1834       switch (symbol)
1835         {
1836         case EXPAND_RECURSE:
1837           switch (node->type)
1838             {
1839             case LITERAL:
1840               {
1841                 tre_literal_t *lit= node->obj;
1842                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1843                   {
1844                     lit->position += pos_add;
1845                     if (lit->position > max_pos)
1846                       max_pos = lit->position;
1847                   }
1848                 break;
1849               }
1850             case UNION:
1851               {
1852                 tre_union_t *uni = node->obj;
1853                 STACK_PUSHX(stack, voidptr, uni->right);
1854                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1855                 STACK_PUSHX(stack, voidptr, uni->left);
1856                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1857                 break;
1858               }
1859             case CATENATION:
1860               {
1861                 tre_catenation_t *cat = node->obj;
1862                 STACK_PUSHX(stack, voidptr, cat->right);
1863                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1864                 STACK_PUSHX(stack, voidptr, cat->left);
1865                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1866                 break;
1867               }
1868             case ITERATION:
1869               {
1870                 tre_iteration_t *iter = node->obj;
1871                 STACK_PUSHX(stack, int, pos_add);
1872                 STACK_PUSHX(stack, voidptr, node);
1873                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1874                 STACK_PUSHX(stack, voidptr, iter->arg);
1875                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1876                 /* If we are going to expand this node at EXPAND_AFTER_ITER
1877                    then don't increase the `pos' fields of the nodes now, it
1878                    will get done when expanding. */
1879                 if (iter->min > 1 || iter->max > 1)
1880                   pos_add = 0;
1881                 iter_depth++;
1882                 break;
1883               }
1884             default:
1885               assert(0);
1886               break;
1887             }
1888           break;
1889         case EXPAND_AFTER_ITER:
1890           {
1891             tre_iteration_t *iter = node->obj;
1892             int pos_add_last;
1893             pos_add = tre_stack_pop_int(stack);
1894             pos_add_last = pos_add;
1895             if (iter->min > 1 || iter->max > 1)
1896               {
1897                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1898                 int j;
1899                 int pos_add_save = pos_add;
1900
1901                 /* Create a catenated sequence of copies of the node. */
1902                 for (j = 0; j < iter->min; j++)
1903                   {
1904                     tre_ast_node_t *copy;
1905                     /* Remove tags from all but the last copy. */
1906                     int flags = ((j + 1 < iter->min)
1907                                  ? COPY_REMOVE_TAGS
1908                                  : COPY_MAXIMIZE_FIRST_TAG);
1909                     pos_add_save = pos_add;
1910                     status = tre_copy_ast(mem, stack, iter->arg, flags,
1911                                           &pos_add, tag_directions, &copy,
1912                                           &max_pos);
1913                     if (status != REG_OK)
1914                       return status;
1915                     if (seq1 != NULL)
1916                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
1917                     else
1918                       seq1 = copy;
1919                     if (seq1 == NULL)
1920                       return REG_ESPACE;
1921                   }
1922
1923                 if (iter->max == -1)
1924                   {
1925                     /* No upper limit. */
1926                     pos_add_save = pos_add;
1927                     status = tre_copy_ast(mem, stack, iter->arg, 0,
1928                                           &pos_add, NULL, &seq2, &max_pos);
1929                     if (status != REG_OK)
1930                       return status;
1931                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1932                     if (seq2 == NULL)
1933                       return REG_ESPACE;
1934                   }
1935                 else
1936                   {
1937                     for (j = iter->min; j < iter->max; j++)
1938                       {
1939                         tre_ast_node_t *tmp, *copy;
1940                         pos_add_save = pos_add;
1941                         status = tre_copy_ast(mem, stack, iter->arg, 0,
1942                                               &pos_add, NULL, &copy, &max_pos);
1943                         if (status != REG_OK)
1944                           return status;
1945                         if (seq2 != NULL)
1946                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
1947                         else
1948                           seq2 = copy;
1949                         if (seq2 == NULL)
1950                           return REG_ESPACE;
1951                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1952                         if (tmp == NULL)
1953                           return REG_ESPACE;
1954                         seq2 = tre_ast_new_union(mem, tmp, seq2);
1955                         if (seq2 == NULL)
1956                           return REG_ESPACE;
1957                       }
1958                   }
1959
1960                 pos_add = pos_add_save;
1961                 if (seq1 == NULL)
1962                   seq1 = seq2;
1963                 else if (seq2 != NULL)
1964                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1965                 if (seq1 == NULL)
1966                   return REG_ESPACE;
1967                 node->obj = seq1->obj;
1968                 node->type = seq1->type;
1969               }
1970
1971             iter_depth--;
1972             pos_add_total += pos_add - pos_add_last;
1973             if (iter_depth == 0)
1974               pos_add = pos_add_total;
1975
1976             break;
1977           }
1978         default:
1979           assert(0);
1980           break;
1981         }
1982     }
1983
1984   *position += pos_add_total;
1985
1986   /* `max_pos' should never be larger than `*position' if the above
1987      code works, but just an extra safeguard let's make sure
1988      `*position' is set large enough so enough memory will be
1989      allocated for the transition table. */
1990   if (max_pos > *position)
1991     *position = max_pos;
1992
1993   return status;
1994 }
1995
1996 static tre_pos_and_tags_t *
1997 tre_set_empty(tre_mem_t mem)
1998 {
1999   tre_pos_and_tags_t *new_set;
2000
2001   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2002   if (new_set == NULL)
2003     return NULL;
2004
2005   new_set[0].position = -1;
2006   new_set[0].code_min = -1;
2007   new_set[0].code_max = -1;
2008
2009   return new_set;
2010 }
2011
2012 static tre_pos_and_tags_t *
2013 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2014             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2015 {
2016   tre_pos_and_tags_t *new_set;
2017
2018   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2019   if (new_set == NULL)
2020     return NULL;
2021
2022   new_set[0].position = position;
2023   new_set[0].code_min = code_min;
2024   new_set[0].code_max = code_max;
2025   new_set[0].class = class;
2026   new_set[0].neg_classes = neg_classes;
2027   new_set[0].backref = backref;
2028   new_set[1].position = -1;
2029   new_set[1].code_min = -1;
2030   new_set[1].code_max = -1;
2031
2032   return new_set;
2033 }
2034
2035 static tre_pos_and_tags_t *
2036 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2037               int *tags, int assertions)
2038 {
2039   int s1, s2, i, j;
2040   tre_pos_and_tags_t *new_set;
2041   int *new_tags;
2042   int num_tags;
2043
2044   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2045   for (s1 = 0; set1[s1].position >= 0; s1++);
2046   for (s2 = 0; set2[s2].position >= 0; s2++);
2047   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2048   if (!new_set )
2049     return NULL;
2050
2051   for (s1 = 0; set1[s1].position >= 0; s1++)
2052     {
2053       new_set[s1].position = set1[s1].position;
2054       new_set[s1].code_min = set1[s1].code_min;
2055       new_set[s1].code_max = set1[s1].code_max;
2056       new_set[s1].assertions = set1[s1].assertions | assertions;
2057       new_set[s1].class = set1[s1].class;
2058       new_set[s1].neg_classes = set1[s1].neg_classes;
2059       new_set[s1].backref = set1[s1].backref;
2060       if (set1[s1].tags == NULL && tags == NULL)
2061         new_set[s1].tags = NULL;
2062       else
2063         {
2064           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2065           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2066                                          * (i + num_tags + 1)));
2067           if (new_tags == NULL)
2068             return NULL;
2069           for (j = 0; j < i; j++)
2070             new_tags[j] = set1[s1].tags[j];
2071           for (i = 0; i < num_tags; i++)
2072             new_tags[j + i] = tags[i];
2073           new_tags[j + i] = -1;
2074           new_set[s1].tags = new_tags;
2075         }
2076     }
2077
2078   for (s2 = 0; set2[s2].position >= 0; s2++)
2079     {
2080       new_set[s1 + s2].position = set2[s2].position;
2081       new_set[s1 + s2].code_min = set2[s2].code_min;
2082       new_set[s1 + s2].code_max = set2[s2].code_max;
2083       /* XXX - why not | assertions here as well? */
2084       new_set[s1 + s2].assertions = set2[s2].assertions;
2085       new_set[s1 + s2].class = set2[s2].class;
2086       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2087       new_set[s1 + s2].backref = set2[s2].backref;
2088       if (set2[s2].tags == NULL)
2089         new_set[s1 + s2].tags = NULL;
2090       else
2091         {
2092           for (i = 0; set2[s2].tags[i] >= 0; i++);
2093           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2094           if (new_tags == NULL)
2095             return NULL;
2096           for (j = 0; j < i; j++)
2097             new_tags[j] = set2[s2].tags[j];
2098           new_tags[j] = -1;
2099           new_set[s1 + s2].tags = new_tags;
2100         }
2101     }
2102   new_set[s1 + s2].position = -1;
2103   return new_set;
2104 }
2105
2106 /* Finds the empty path through `node' which is the one that should be
2107    taken according to POSIX.2 rules, and adds the tags on that path to
2108    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2109    set to the number of tags seen on the path. */
2110 static reg_errcode_t
2111 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2112                 int *assertions, int *num_tags_seen)
2113 {
2114   tre_literal_t *lit;
2115   tre_union_t *uni;
2116   tre_catenation_t *cat;
2117   tre_iteration_t *iter;
2118   int i;
2119   int bottom = tre_stack_num_objects(stack);
2120   reg_errcode_t status = REG_OK;
2121   if (num_tags_seen)
2122     *num_tags_seen = 0;
2123
2124   status = tre_stack_push_voidptr(stack, node);
2125
2126   /* Walk through the tree recursively. */
2127   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2128     {
2129       node = tre_stack_pop_voidptr(stack);
2130
2131       switch (node->type)
2132         {
2133         case LITERAL:
2134           lit = (tre_literal_t *)node->obj;
2135           switch (lit->code_min)
2136             {
2137             case TAG:
2138               if (lit->code_max >= 0)
2139                 {
2140                   if (tags != NULL)
2141                     {
2142                       /* Add the tag to `tags'. */
2143                       for (i = 0; tags[i] >= 0; i++)
2144                         if (tags[i] == lit->code_max)
2145                           break;
2146                       if (tags[i] < 0)
2147                         {
2148                           tags[i] = lit->code_max;
2149                           tags[i + 1] = -1;
2150                         }
2151                     }
2152                   if (num_tags_seen)
2153                     (*num_tags_seen)++;
2154                 }
2155               break;
2156             case ASSERTION:
2157               assert(lit->code_max >= 1
2158                      || lit->code_max <= ASSERT_LAST);
2159               if (assertions != NULL)
2160                 *assertions |= lit->code_max;
2161               break;
2162             case EMPTY:
2163               break;
2164             default:
2165               assert(0);
2166               break;
2167             }
2168           break;
2169
2170         case UNION:
2171           /* Subexpressions starting earlier take priority over ones
2172              starting later, so we prefer the left subexpression over the
2173              right subexpression. */
2174           uni = (tre_union_t *)node->obj;
2175           if (uni->left->nullable)
2176             STACK_PUSHX(stack, voidptr, uni->left)
2177           else if (uni->right->nullable)
2178             STACK_PUSHX(stack, voidptr, uni->right)
2179           else
2180             assert(0);
2181           break;
2182
2183         case CATENATION:
2184           /* The path must go through both children. */
2185           cat = (tre_catenation_t *)node->obj;
2186           assert(cat->left->nullable);
2187           assert(cat->right->nullable);
2188           STACK_PUSHX(stack, voidptr, cat->left);
2189           STACK_PUSHX(stack, voidptr, cat->right);
2190           break;
2191
2192         case ITERATION:
2193           /* A match with an empty string is preferred over no match at
2194              all, so we go through the argument if possible. */
2195           iter = (tre_iteration_t *)node->obj;
2196           if (iter->arg->nullable)
2197             STACK_PUSHX(stack, voidptr, iter->arg);
2198           break;
2199
2200         default:
2201           assert(0);
2202           break;
2203         }
2204     }
2205
2206   return status;
2207 }
2208
2209
2210 typedef enum {
2211   NFL_RECURSE,
2212   NFL_POST_UNION,
2213   NFL_POST_CATENATION,
2214   NFL_POST_ITERATION
2215 } tre_nfl_stack_symbol_t;
2216
2217
2218 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2219    the nodes of the AST `tree'. */
2220 static reg_errcode_t
2221 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2222 {
2223   int bottom = tre_stack_num_objects(stack);
2224
2225   STACK_PUSHR(stack, voidptr, tree);
2226   STACK_PUSHR(stack, int, NFL_RECURSE);
2227
2228   while (tre_stack_num_objects(stack) > bottom)
2229     {
2230       tre_nfl_stack_symbol_t symbol;
2231       tre_ast_node_t *node;
2232
2233       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2234       node = tre_stack_pop_voidptr(stack);
2235       switch (symbol)
2236         {
2237         case NFL_RECURSE:
2238           switch (node->type)
2239             {
2240             case LITERAL:
2241               {
2242                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2243                 if (IS_BACKREF(lit))
2244                   {
2245                     /* Back references: nullable = false, firstpos = {i},
2246                        lastpos = {i}. */
2247                     node->nullable = 0;
2248                     node->firstpos = tre_set_one(mem, lit->position, 0,
2249                                              TRE_CHAR_MAX, 0, NULL, -1);
2250                     if (!node->firstpos)
2251                       return REG_ESPACE;
2252                     node->lastpos = tre_set_one(mem, lit->position, 0,
2253                                                 TRE_CHAR_MAX, 0, NULL,
2254                                                 (int)lit->code_max);
2255                     if (!node->lastpos)
2256                       return REG_ESPACE;
2257                   }
2258                 else if (lit->code_min < 0)
2259                   {
2260                     /* Tags, empty strings, params, and zero width assertions:
2261                        nullable = true, firstpos = {}, and lastpos = {}. */
2262                     node->nullable = 1;
2263                     node->firstpos = tre_set_empty(mem);
2264                     if (!node->firstpos)
2265                       return REG_ESPACE;
2266                     node->lastpos = tre_set_empty(mem);
2267                     if (!node->lastpos)
2268                       return REG_ESPACE;
2269                   }
2270                 else
2271                   {
2272                     /* Literal at position i: nullable = false, firstpos = {i},
2273                        lastpos = {i}. */
2274                     node->nullable = 0;
2275                     node->firstpos =
2276                       tre_set_one(mem, lit->position, (int)lit->code_min,
2277                                   (int)lit->code_max, 0, NULL, -1);
2278                     if (!node->firstpos)
2279                       return REG_ESPACE;
2280                     node->lastpos = tre_set_one(mem, lit->position,
2281                                                 (int)lit->code_min,
2282                                                 (int)lit->code_max,
2283                                                 lit->class, lit->neg_classes,
2284                                                 -1);
2285                     if (!node->lastpos)
2286                       return REG_ESPACE;
2287                   }
2288                 break;
2289               }
2290
2291             case UNION:
2292               /* Compute the attributes for the two subtrees, and after that
2293                  for this node. */
2294               STACK_PUSHR(stack, voidptr, node);
2295               STACK_PUSHR(stack, int, NFL_POST_UNION);
2296               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2297               STACK_PUSHR(stack, int, NFL_RECURSE);
2298               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2299               STACK_PUSHR(stack, int, NFL_RECURSE);
2300               break;
2301
2302             case CATENATION:
2303               /* Compute the attributes for the two subtrees, and after that
2304                  for this node. */
2305               STACK_PUSHR(stack, voidptr, node);
2306               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2307               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2308               STACK_PUSHR(stack, int, NFL_RECURSE);
2309               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2310               STACK_PUSHR(stack, int, NFL_RECURSE);
2311               break;
2312
2313             case ITERATION:
2314               /* Compute the attributes for the subtree, and after that for
2315                  this node. */
2316               STACK_PUSHR(stack, voidptr, node);
2317               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2318               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2319               STACK_PUSHR(stack, int, NFL_RECURSE);
2320               break;
2321             }
2322           break; /* end case: NFL_RECURSE */
2323
2324         case NFL_POST_UNION:
2325           {
2326             tre_union_t *uni = (tre_union_t *)node->obj;
2327             node->nullable = uni->left->nullable || uni->right->nullable;
2328             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2329                                            uni->right->firstpos, NULL, 0);
2330             if (!node->firstpos)
2331               return REG_ESPACE;
2332             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2333                                           uni->right->lastpos, NULL, 0);
2334             if (!node->lastpos)
2335               return REG_ESPACE;
2336             break;
2337           }
2338
2339         case NFL_POST_ITERATION:
2340           {
2341             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2342
2343             if (iter->min == 0 || iter->arg->nullable)
2344               node->nullable = 1;
2345             else
2346               node->nullable = 0;
2347             node->firstpos = iter->arg->firstpos;
2348             node->lastpos = iter->arg->lastpos;
2349             break;
2350           }
2351
2352         case NFL_POST_CATENATION:
2353           {
2354             int num_tags, *tags, assertions;
2355             reg_errcode_t status;
2356             tre_catenation_t *cat = node->obj;
2357             node->nullable = cat->left->nullable && cat->right->nullable;
2358
2359             /* Compute firstpos. */
2360             if (cat->left->nullable)
2361               {
2362                 /* The left side matches the empty string.  Make a first pass
2363                    with tre_match_empty() to get the number of tags and
2364                    parameters. */
2365                 status = tre_match_empty(stack, cat->left,
2366                                          NULL, NULL, &num_tags);
2367                 if (status != REG_OK)
2368                   return status;
2369                 /* Allocate arrays for the tags and parameters. */
2370                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2371                 if (!tags)
2372                   return REG_ESPACE;
2373                 tags[0] = -1;
2374                 assertions = 0;
2375                 /* Second pass with tre_mach_empty() to get the list of
2376                    tags and parameters. */
2377                 status = tre_match_empty(stack, cat->left, tags,
2378                                          &assertions, NULL);
2379                 if (status != REG_OK)
2380                   {
2381                     xfree(tags);
2382                     return status;
2383                   }
2384                 node->firstpos =
2385                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2386                                 tags, assertions);
2387                 xfree(tags);
2388                 if (!node->firstpos)
2389                   return REG_ESPACE;
2390               }
2391             else
2392               {
2393                 node->firstpos = cat->left->firstpos;
2394               }
2395
2396             /* Compute lastpos. */
2397             if (cat->right->nullable)
2398               {
2399                 /* The right side matches the empty string.  Make a first pass
2400                    with tre_match_empty() to get the number of tags and
2401                    parameters. */
2402                 status = tre_match_empty(stack, cat->right,
2403                                          NULL, NULL, &num_tags);
2404                 if (status != REG_OK)
2405                   return status;
2406                 /* Allocate arrays for the tags and parameters. */
2407                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2408                 if (!tags)
2409                   return REG_ESPACE;
2410                 tags[0] = -1;
2411                 assertions = 0;
2412                 /* Second pass with tre_mach_empty() to get the list of
2413                    tags and parameters. */
2414                 status = tre_match_empty(stack, cat->right, tags,
2415                                          &assertions, NULL);
2416                 if (status != REG_OK)
2417                   {
2418                     xfree(tags);
2419                     return status;
2420                   }
2421                 node->lastpos =
2422                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2423                                 tags, assertions);
2424                 xfree(tags);
2425                 if (!node->lastpos)
2426                   return REG_ESPACE;
2427               }
2428             else
2429               {
2430                 node->lastpos = cat->right->lastpos;
2431               }
2432             break;
2433           }
2434
2435         default:
2436           assert(0);
2437           break;
2438         }
2439     }
2440
2441   return REG_OK;
2442 }
2443
2444
2445 /* Adds a transition from each position in `p1' to each position in `p2'. */
2446 static reg_errcode_t
2447 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2448                tre_tnfa_transition_t *transitions,
2449                int *counts, int *offs)
2450 {
2451   tre_pos_and_tags_t *orig_p2 = p2;
2452   tre_tnfa_transition_t *trans;
2453   int i, j, k, l, dup, prev_p2_pos;
2454
2455   if (transitions != NULL)
2456     while (p1->position >= 0)
2457       {
2458         p2 = orig_p2;
2459         prev_p2_pos = -1;
2460         while (p2->position >= 0)
2461           {
2462             /* Optimization: if this position was already handled, skip it. */
2463             if (p2->position == prev_p2_pos)
2464               {
2465                 p2++;
2466                 continue;
2467               }
2468             prev_p2_pos = p2->position;
2469             /* Set `trans' to point to the next unused transition from
2470                position `p1->position'. */
2471             trans = transitions + offs[p1->position];
2472             while (trans->state != NULL)
2473               {
2474 #if 0
2475                 /* If we find a previous transition from `p1->position' to
2476                    `p2->position', it is overwritten.  This can happen only
2477                    if there are nested loops in the regexp, like in "((a)*)*".
2478                    In POSIX.2 repetition using the outer loop is always
2479                    preferred over using the inner loop.  Therefore the
2480                    transition for the inner loop is useless and can be thrown
2481                    away. */
2482                 /* XXX - The same position is used for all nodes in a bracket
2483                    expression, so this optimization cannot be used (it will
2484                    break bracket expressions) unless I figure out a way to
2485                    detect it here. */
2486                 if (trans->state_id == p2->position)
2487                   {
2488                     break;
2489                   }
2490 #endif
2491                 trans++;
2492               }
2493
2494             if (trans->state == NULL)
2495               (trans + 1)->state = NULL;
2496             /* Use the character ranges, assertions, etc. from `p1' for
2497                the transition from `p1' to `p2'. */
2498             trans->code_min = p1->code_min;
2499             trans->code_max = p1->code_max;
2500             trans->state = transitions + offs[p2->position];
2501             trans->state_id = p2->position;
2502             trans->assertions = p1->assertions | p2->assertions
2503               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2504               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2505             if (p1->backref >= 0)
2506               {
2507                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2508                 assert(p2->backref < 0);
2509                 trans->u.backref = p1->backref;
2510                 trans->assertions |= ASSERT_BACKREF;
2511               }
2512             else
2513               trans->u.class = p1->class;
2514             if (p1->neg_classes != NULL)
2515               {
2516                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2517                 trans->neg_classes =
2518                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2519                 if (trans->neg_classes == NULL)
2520                   return REG_ESPACE;
2521                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2522                   trans->neg_classes[i] = p1->neg_classes[i];
2523                 trans->neg_classes[i] = (tre_ctype_t)0;
2524               }
2525             else
2526               trans->neg_classes = NULL;
2527
2528             /* Find out how many tags this transition has. */
2529             i = 0;
2530             if (p1->tags != NULL)
2531               while(p1->tags[i] >= 0)
2532                 i++;
2533             j = 0;
2534             if (p2->tags != NULL)
2535               while(p2->tags[j] >= 0)
2536                 j++;
2537
2538             /* If we are overwriting a transition, free the old tag array. */
2539             if (trans->tags != NULL)
2540               xfree(trans->tags);
2541             trans->tags = NULL;
2542
2543             /* If there were any tags, allocate an array and fill it. */
2544             if (i + j > 0)
2545               {
2546                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2547                 if (!trans->tags)
2548                   return REG_ESPACE;
2549                 i = 0;
2550                 if (p1->tags != NULL)
2551                   while(p1->tags[i] >= 0)
2552                     {
2553                       trans->tags[i] = p1->tags[i];
2554                       i++;
2555                     }
2556                 l = i;
2557                 j = 0;
2558                 if (p2->tags != NULL)
2559                   while (p2->tags[j] >= 0)
2560                     {
2561                       /* Don't add duplicates. */
2562                       dup = 0;
2563                       for (k = 0; k < i; k++)
2564                         if (trans->tags[k] == p2->tags[j])
2565                           {
2566                             dup = 1;
2567                             break;
2568                           }
2569                       if (!dup)
2570                         trans->tags[l++] = p2->tags[j];
2571                       j++;
2572                     }
2573                 trans->tags[l] = -1;
2574               }
2575
2576             p2++;
2577           }
2578         p1++;
2579       }
2580   else
2581     /* Compute a maximum limit for the number of transitions leaving
2582        from each state. */
2583     while (p1->position >= 0)
2584       {
2585         p2 = orig_p2;
2586         while (p2->position >= 0)
2587           {
2588             counts[p1->position]++;
2589             p2++;
2590           }
2591         p1++;
2592       }
2593   return REG_OK;
2594 }
2595
2596 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
2597    labelled with one character range (there are no transitions on empty
2598    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2599    the regexp. */
2600 static reg_errcode_t
2601 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2602                 int *counts, int *offs)
2603 {
2604   tre_union_t *uni;
2605   tre_catenation_t *cat;
2606   tre_iteration_t *iter;
2607   reg_errcode_t errcode = REG_OK;
2608
2609   /* XXX - recurse using a stack!. */
2610   switch (node->type)
2611     {
2612     case LITERAL:
2613       break;
2614     case UNION:
2615       uni = (tre_union_t *)node->obj;
2616       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2617       if (errcode != REG_OK)
2618         return errcode;
2619       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2620       break;
2621
2622     case CATENATION:
2623       cat = (tre_catenation_t *)node->obj;
2624       /* Add a transition from each position in cat->left->lastpos
2625          to each position in cat->right->firstpos. */
2626       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2627                                transitions, counts, offs);
2628       if (errcode != REG_OK)
2629         return errcode;
2630       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2631       if (errcode != REG_OK)
2632         return errcode;
2633       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2634       break;
2635
2636     case ITERATION:
2637       iter = (tre_iteration_t *)node->obj;
2638       assert(iter->max == -1 || iter->max == 1);
2639
2640       if (iter->max == -1)
2641         {
2642           assert(iter->min == 0 || iter->min == 1);
2643           /* Add a transition from each last position in the iterated
2644              expression to each first position. */
2645           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2646                                    transitions, counts, offs);
2647           if (errcode != REG_OK)
2648             return errcode;
2649         }
2650       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2651       break;
2652     }
2653   return errcode;
2654 }
2655
2656
2657 #define ERROR_EXIT(err)           \
2658   do                              \
2659     {                             \
2660       errcode = err;              \
2661       if (/*CONSTCOND*/1)         \
2662         goto error_exit;          \
2663     }                             \
2664  while (/*CONSTCOND*/0)
2665
2666
2667 int
2668 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2669 {
2670   tre_stack_t *stack;
2671   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2672   tre_pos_and_tags_t *p;
2673   int *counts = NULL, *offs = NULL;
2674   int i, add = 0;
2675   tre_tnfa_transition_t *transitions, *initial;
2676   tre_tnfa_t *tnfa = NULL;
2677   tre_submatch_data_t *submatch_data;
2678   tre_tag_direction_t *tag_directions = NULL;
2679   reg_errcode_t errcode;
2680   tre_mem_t mem;
2681
2682   /* Parse context. */
2683   tre_parse_ctx_t parse_ctx;
2684
2685   /* Allocate a stack used throughout the compilation process for various
2686      purposes. */
2687   stack = tre_stack_new(512, 1024000, 128);
2688   if (!stack)
2689     return REG_ESPACE;
2690   /* Allocate a fast memory allocator. */
2691   mem = tre_mem_new();
2692   if (!mem)
2693     {
2694       tre_stack_destroy(stack);
2695       return REG_ESPACE;
2696     }
2697
2698   /* Parse the regexp. */
2699   memset(&parse_ctx, 0, sizeof(parse_ctx));
2700   parse_ctx.mem = mem;
2701   parse_ctx.stack = stack;
2702   parse_ctx.re = regex;
2703   parse_ctx.cflags = cflags;
2704   parse_ctx.max_backref = -1;
2705   errcode = tre_parse(&parse_ctx);
2706   if (errcode != REG_OK)
2707     ERROR_EXIT(errcode);
2708   preg->re_nsub = parse_ctx.submatch_id - 1;
2709   tree = parse_ctx.n;
2710
2711 #ifdef TRE_DEBUG
2712   tre_ast_print(tree);
2713 #endif /* TRE_DEBUG */
2714
2715   /* Referring to nonexistent subexpressions is illegal. */
2716   if (parse_ctx.max_backref > (int)preg->re_nsub)
2717     ERROR_EXIT(REG_ESUBREG);
2718
2719   /* Allocate the TNFA struct. */
2720   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2721   if (tnfa == NULL)
2722     ERROR_EXIT(REG_ESPACE);
2723   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2724   tnfa->have_approx = 0;
2725   tnfa->num_submatches = parse_ctx.submatch_id;
2726
2727   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2728      regexp does not have back references, this can be skipped. */
2729   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2730     {
2731
2732       /* Figure out how many tags we will need. */
2733       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2734       if (errcode != REG_OK)
2735         ERROR_EXIT(errcode);
2736
2737       if (tnfa->num_tags > 0)
2738         {
2739           tag_directions = xmalloc(sizeof(*tag_directions)
2740                                    * (tnfa->num_tags + 1));
2741           if (tag_directions == NULL)
2742             ERROR_EXIT(REG_ESPACE);
2743           tnfa->tag_directions = tag_directions;
2744           memset(tag_directions, -1,
2745                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
2746         }
2747       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2748                                    sizeof(*tnfa->minimal_tags));
2749       if (tnfa->minimal_tags == NULL)
2750         ERROR_EXIT(REG_ESPACE);
2751
2752       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2753                               sizeof(*submatch_data));
2754       if (submatch_data == NULL)
2755         ERROR_EXIT(REG_ESPACE);
2756       tnfa->submatch_data = submatch_data;
2757
2758       errcode = tre_add_tags(mem, stack, tree, tnfa);
2759       if (errcode != REG_OK)
2760         ERROR_EXIT(errcode);
2761
2762     }
2763
2764   /* Expand iteration nodes. */
2765   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2766                            tag_directions);
2767   if (errcode != REG_OK)
2768     ERROR_EXIT(errcode);
2769
2770   /* Add a dummy node for the final state.
2771      XXX - For certain patterns this dummy node can be optimized away,
2772            for example "a*" or "ab*".   Figure out a simple way to detect
2773            this possibility. */
2774   tmp_ast_l = tree;
2775   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2776   if (tmp_ast_r == NULL)
2777     ERROR_EXIT(REG_ESPACE);
2778
2779   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2780   if (tree == NULL)
2781     ERROR_EXIT(REG_ESPACE);
2782
2783   errcode = tre_compute_nfl(mem, stack, tree);
2784   if (errcode != REG_OK)
2785     ERROR_EXIT(errcode);
2786
2787   counts = xmalloc(sizeof(int) * parse_ctx.position);
2788   if (counts == NULL)
2789     ERROR_EXIT(REG_ESPACE);
2790
2791   offs = xmalloc(sizeof(int) * parse_ctx.position);
2792   if (offs == NULL)
2793     ERROR_EXIT(REG_ESPACE);
2794
2795   for (i = 0; i < parse_ctx.position; i++)
2796     counts[i] = 0;
2797   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2798
2799   add = 0;
2800   for (i = 0; i < parse_ctx.position; i++)
2801     {
2802       offs[i] = add;
2803       add += counts[i] + 1;
2804       counts[i] = 0;
2805     }
2806   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2807   if (transitions == NULL)
2808     ERROR_EXIT(REG_ESPACE);
2809   tnfa->transitions = transitions;
2810   tnfa->num_transitions = add;
2811
2812   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2813   if (errcode != REG_OK)
2814     ERROR_EXIT(errcode);
2815
2816   tnfa->firstpos_chars = NULL;
2817
2818   p = tree->firstpos;
2819   i = 0;
2820   while (p->position >= 0)
2821     {
2822       i++;
2823       p++;
2824     }
2825
2826   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2827   if (initial == NULL)
2828     ERROR_EXIT(REG_ESPACE);
2829   tnfa->initial = initial;
2830
2831   i = 0;
2832   for (p = tree->firstpos; p->position >= 0; p++)
2833     {
2834       initial[i].state = transitions + offs[p->position];
2835       initial[i].state_id = p->position;
2836       initial[i].tags = NULL;
2837       /* Copy the arrays p->tags, and p->params, they are allocated
2838          from a tre_mem object. */
2839       if (p->tags)
2840         {
2841           int j;
2842           for (j = 0; p->tags[j] >= 0; j++);
2843           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2844           if (!initial[i].tags)
2845             ERROR_EXIT(REG_ESPACE);
2846           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2847         }
2848       initial[i].assertions = p->assertions;
2849       i++;
2850     }
2851   initial[i].state = NULL;
2852
2853   tnfa->num_transitions = add;
2854   tnfa->final = transitions + offs[tree->lastpos[0].position];
2855   tnfa->num_states = parse_ctx.position;
2856   tnfa->cflags = cflags;
2857
2858   tre_mem_destroy(mem);
2859   tre_stack_destroy(stack);
2860   xfree(counts);
2861   xfree(offs);
2862
2863   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2864   return REG_OK;
2865
2866  error_exit:
2867   /* Free everything that was allocated and return the error code. */
2868   tre_mem_destroy(mem);
2869   if (stack != NULL)
2870     tre_stack_destroy(stack);
2871   if (counts != NULL)
2872     xfree(counts);
2873   if (offs != NULL)
2874     xfree(offs);
2875   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2876   regfree(preg);
2877   return errcode;
2878 }
2879
2880
2881
2882
2883 void
2884 regfree(regex_t *preg)
2885 {
2886   tre_tnfa_t *tnfa;
2887   unsigned int i;
2888   tre_tnfa_transition_t *trans;
2889
2890   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2891   if (!tnfa)
2892     return;
2893
2894   for (i = 0; i < tnfa->num_transitions; i++)
2895     if (tnfa->transitions[i].state)
2896       {
2897         if (tnfa->transitions[i].tags)
2898           xfree(tnfa->transitions[i].tags);
2899         if (tnfa->transitions[i].neg_classes)
2900           xfree(tnfa->transitions[i].neg_classes);
2901       }
2902   if (tnfa->transitions)
2903     xfree(tnfa->transitions);
2904
2905   if (tnfa->initial)
2906     {
2907       for (trans = tnfa->initial; trans->state; trans++)
2908         {
2909           if (trans->tags)
2910             xfree(trans->tags);
2911         }
2912       xfree(tnfa->initial);
2913     }
2914
2915   if (tnfa->submatch_data)
2916     {
2917       for (i = 0; i < tnfa->num_submatches; i++)
2918         if (tnfa->submatch_data[i].parents)
2919           xfree(tnfa->submatch_data[i].parents);
2920       xfree(tnfa->submatch_data);
2921     }
2922
2923   if (tnfa->tag_directions)
2924     xfree(tnfa->tag_directions);
2925   if (tnfa->firstpos_chars)
2926     xfree(tnfa->firstpos_chars);
2927   if (tnfa->minimal_tags)
2928     xfree(tnfa->minimal_tags);
2929   xfree(tnfa);
2930 }