nsz Git - musl/blob - src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37 #include <ctype.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56 } tre_pos_and_tags_t;
  57
  58
  59 /***********************************************************************
  60  from tre-ast.c and tre-ast.h
  61 ***********************************************************************/
  62
  63 /* The different AST node types. */
  64 typedef enum {
  65   LITERAL,
  66   CATENATION,
  67   ITERATION,
  68   UNION
  69 } tre_ast_type_t;
  70
  71 /* Special subtypes of TRE_LITERAL. */
  72 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  73 #define ASSERTION -2   /* Assertion leaf. */
  74 #define TAG       -3   /* Tag leaf. */
  75 #define BACKREF   -4   /* Back reference leaf. */
  76
  77 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  78 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  80 #define IS_TAG(x)       ((x)->code_min == TAG)
  81 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  82
  83
  84 /* A generic AST node.  All AST nodes consist of this node on the top
  85    level with `obj' pointing to the actual content. */
  86 typedef struct {
  87   tre_ast_type_t type;   /* Type of the node. */
  88   void *obj;             /* Pointer to actual node. */
  89   int nullable;
  90   int submatch_id;
  91   int num_submatches;
  92   int num_tags;
  93   tre_pos_and_tags_t *firstpos;
  94   tre_pos_and_tags_t *lastpos;
  95 } tre_ast_node_t;
  96
  97
  98 /* A "literal" node.  These are created for assertions, back references,
  99    tags, matching parameter settings, and all expressions that match one
 100    character. */
 101 typedef struct {
 102   long code_min;
 103   long code_max;
 104   int position;
 105   tre_ctype_t class;
 106   tre_ctype_t *neg_classes;
 107 } tre_literal_t;
 108
 109 /* A "catenation" node.  These are created when two regexps are concatenated.
 110    If there are more than one subexpressions in sequence, the `left' part
 111    holds all but the last, and `right' part holds the last subexpression
 112    (catenation is left associative). */
 113 typedef struct {
 114   tre_ast_node_t *left;
 115   tre_ast_node_t *right;
 116 } tre_catenation_t;
 117
 118 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 119    operators. */
 120 typedef struct {
 121   /* Subexpression to match. */
 122   tre_ast_node_t *arg;
 123   /* Minimum number of consecutive matches. */
 124   int min;
 125   /* Maximum number of consecutive matches. */
 126   int max;
 127   /* If 0, match as many characters as possible, if 1 match as few as
 128      possible.  Note that this does not always mean the same thing as
 129      matching as many/few repetitions as possible. */
 130   unsigned int minimal:1;
 131 } tre_iteration_t;
 132
 133 /* An "union" node.  These are created for the "|" operator. */
 134 typedef struct {
 135   tre_ast_node_t *left;
 136   tre_ast_node_t *right;
 137 } tre_union_t;
 138
 139
 140 static tre_ast_node_t *
 141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 142 {
 143         tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
 144         if (!node || !obj)
 145                 return 0;
 146         node->obj = obj;
 147         node->type = type;
 148         node->nullable = -1;
 149         node->submatch_id = -1;
 150         return node;
 151 }
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 155 {
 156         tre_ast_node_t *node;
 157         tre_literal_t *lit;
 158
 159         lit = tre_mem_calloc(mem, sizeof *lit);
 160         node = tre_ast_new_node(mem, LITERAL, lit);
 161         if (!node)
 162                 return 0;
 163         lit->code_min = code_min;
 164         lit->code_max = code_max;
 165         lit->position = position;
 166         return node;
 167 }
 168
 169 static tre_ast_node_t *
 170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 171 {
 172         tre_ast_node_t *node;
 173         tre_iteration_t *iter;
 174
 175         iter = tre_mem_calloc(mem, sizeof *iter);
 176         node = tre_ast_new_node(mem, ITERATION, iter);
 177         if (!node)
 178                 return 0;
 179         iter->arg = arg;
 180         iter->min = min;
 181         iter->max = max;
 182         iter->minimal = minimal;
 183         node->num_submatches = arg->num_submatches;
 184         return node;
 185 }
 186
 187 static tre_ast_node_t *
 188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 189 {
 190         tre_ast_node_t *node;
 191         tre_union_t *un;
 192
 193         if (!left)
 194                 return right;
 195         un = tre_mem_calloc(mem, sizeof *un);
 196         node = tre_ast_new_node(mem, UNION, un);
 197         if (!node || !right)
 198                 return 0;
 199         un->left = left;
 200         un->right = right;
 201         node->num_submatches = left->num_submatches + right->num_submatches;
 202         return node;
 203 }
 204
 205 static tre_ast_node_t *
 206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 207 {
 208         tre_ast_node_t *node;
 209         tre_catenation_t *cat;
 210
 211         if (!left)
 212                 return right;
 213         cat = tre_mem_calloc(mem, sizeof *cat);
 214         node = tre_ast_new_node(mem, CATENATION, cat);
 215         if (!node)
 216                 return 0;
 217         cat->left = left;
 218         cat->right = right;
 219         node->num_submatches = left->num_submatches + right->num_submatches;
 220         return node;
 221 }
 222
 223
 224 /***********************************************************************
 225  from tre-stack.c and tre-stack.h
 226 ***********************************************************************/
 227
 228 typedef struct tre_stack_rec tre_stack_t;
 229
 230 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 231    is maximum size, and `increment' specifies how much more space will be
 232    allocated with realloc() if all space gets used up.  Returns the stack
 233    object or NULL if out of memory. */
 234 static tre_stack_t *
 235 tre_stack_new(int size, int max_size, int increment);
 236
 237 /* Frees the stack object. */
 238 static void
 239 tre_stack_destroy(tre_stack_t *s);
 240
 241 /* Returns the current number of objects in the stack. */
 242 static int
 243 tre_stack_num_objects(tre_stack_t *s);
 244
 245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 247    This tries to realloc() more space before failing if maximum size
 248    has not yet been reached.  Returns REG_OK if successful. */
 249 #define declare_pushf(typetag, type)                                          \
 250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 251
 252 declare_pushf(voidptr, void *);
 253 declare_pushf(int, int);
 254
 255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 256    element off of stack `s' and returns it.  The stack must not be
 257    empty. */
 258 #define declare_popf(typetag, type)               \
 259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 260
 261 declare_popf(voidptr, void *);
 262 declare_popf(int, int);
 263
 264 /* Just to save some typing. */
 265 #define STACK_PUSH(s, typetag, value)                                         \
 266   do                                                                          \
 267     {                                                                         \
 268       status = tre_stack_push_ ## typetag(s, value);                          \
 269     }                                                                         \
 270   while (/*CONSTCOND*/0)
 271
 272 #define STACK_PUSHX(s, typetag, value)                                        \
 273   {                                                                           \
 274     status = tre_stack_push_ ## typetag(s, value);                            \
 275     if (status != REG_OK)                                                     \
 276       break;                                                                  \
 277   }
 278
 279 #define STACK_PUSHR(s, typetag, value)                                        \
 280   {                                                                           \
 281     reg_errcode_t _status;                                                    \
 282     _status = tre_stack_push_ ## typetag(s, value);                           \
 283     if (_status != REG_OK)                                                    \
 284       return _status;                                                         \
 285   }
 286
 287 union tre_stack_item {
 288   void *voidptr_value;
 289   int int_value;
 290 };
 291
 292 struct tre_stack_rec {
 293   int size;
 294   int max_size;
 295   int increment;
 296   int ptr;
 297   union tre_stack_item *stack;
 298 };
 299
 300
 301 static tre_stack_t *
 302 tre_stack_new(int size, int max_size, int increment)
 303 {
 304   tre_stack_t *s;
 305
 306   s = xmalloc(sizeof(*s));
 307   if (s != NULL)
 308     {
 309       s->stack = xmalloc(sizeof(*s->stack) * size);
 310       if (s->stack == NULL)
 311         {
 312           xfree(s);
 313           return NULL;
 314         }
 315       s->size = size;
 316       s->max_size = max_size;
 317       s->increment = increment;
 318       s->ptr = 0;
 319     }
 320   return s;
 321 }
 322
 323 static void
 324 tre_stack_destroy(tre_stack_t *s)
 325 {
 326   xfree(s->stack);
 327   xfree(s);
 328 }
 329
 330 static int
 331 tre_stack_num_objects(tre_stack_t *s)
 332 {
 333   return s->ptr;
 334 }
 335
 336 static reg_errcode_t
 337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 338 {
 339   if (s->ptr < s->size)
 340     {
 341       s->stack[s->ptr] = value;
 342       s->ptr++;
 343     }
 344   else
 345     {
 346       if (s->size >= s->max_size)
 347         {
 348           return REG_ESPACE;
 349         }
 350       else
 351         {
 352           union tre_stack_item *new_buffer;
 353           int new_size;
 354           new_size = s->size + s->increment;
 355           if (new_size > s->max_size)
 356             new_size = s->max_size;
 357           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 358           if (new_buffer == NULL)
 359             {
 360               return REG_ESPACE;
 361             }
 362           assert(new_size > s->size);
 363           s->size = new_size;
 364           s->stack = new_buffer;
 365           tre_stack_push(s, value);
 366         }
 367     }
 368   return REG_OK;
 369 }
 370
 371 #define define_pushf(typetag, type)  \
 372   declare_pushf(typetag, type) {     \
 373     union tre_stack_item item;       \
 374     item.typetag ## _value = value;  \
 375     return tre_stack_push(s, item);  \
 376 }
 377
 378 define_pushf(int, int)
 379 define_pushf(voidptr, void *)
 380
 381 #define define_popf(typetag, type)                  \
 382   declare_popf(typetag, type) {                     \
 383     return s->stack[--s->ptr].typetag ## _value;    \
 384   }
 385
 386 define_popf(int, int)
 387 define_popf(voidptr, void *)
 388
 389
 390 /***********************************************************************
 391  from tre-parse.c and tre-parse.h
 392 ***********************************************************************/
 393
 394 /* Parse context. */
 395 typedef struct {
 396         /* Memory allocator. The AST is allocated using this. */
 397         tre_mem_t mem;
 398         /* Stack used for keeping track of regexp syntax. */
 399         tre_stack_t *stack;
 400         /* The parsed node after a parse function returns. */
 401         tre_ast_node_t *n;
 402         /* Position in the regexp pattern after a parse function returns. */
 403         const char *s;
 404         /* The first character of the regexp. */
 405         const char *re;
 406         /* Current submatch ID. */
 407         int submatch_id;
 408         /* Current position (number of literal). */
 409         int position;
 410         /* The highest back reference or -1 if none seen so far. */
 411         int max_backref;
 412         /* Compilation flags. */
 413         int cflags;
 414 } tre_parse_ctx_t;
 415
 416 /* Some macros for expanding \w, \s, etc. */
 417 static const struct {
 418         char c;
 419         const char *expansion;
 420 } tre_macros[] = {
 421         {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 422         {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 423         {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 424         {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
 425         { 0, 0 }
 426 };
 427
 428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 429    must have at least `len' items.  Sets buf[0] to zero if the there
 430    is no match in `tre_macros'. */
 431 static const char *tre_expand_macro(const char *s)
 432 {
 433         int i;
 434         for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
 435         return tre_macros[i].expansion;
 436 }
 437
 438 static int
 439 tre_compare_lit(const void *a, const void *b)
 440 {
 441         const tre_literal_t *const *la = a;
 442         const tre_literal_t *const *lb = b;
 443         /* assumes the range of valid code_min is < INT_MAX */
 444         return la[0]->code_min - lb[0]->code_min;
 445 }
 446
 447 struct literals {
 448         tre_mem_t mem;
 449         tre_literal_t **a;
 450         int len;
 451         int cap;
 452 };
 453
 454 static tre_literal_t *tre_new_lit(struct literals *p)
 455 {
 456         tre_literal_t **a;
 457         if (p->len >= p->cap) {
 458                 if (p->cap >= 1<<15)
 459                         return 0;
 460                 p->cap *= 2;
 461                 a = xrealloc(p->a, p->cap * sizeof *p->a);
 462                 if (!a)
 463                         return 0;
 464                 p->a = a;
 465         }
 466         a = p->a + p->len++;
 467         *a = tre_mem_calloc(p->mem, sizeof **a);
 468         return *a;
 469 }
 470
 471 static int add_icase_literals(struct literals *ls, int min, int max)
 472 {
 473         tre_literal_t *lit;
 474         int b, e, c;
 475         for (c=min; c<=max; ) {
 476                 /* assumes islower(c) and isupper(c) are exclusive
 477                    and toupper(c)!=c if islower(c).
 478                    multiple opposite case characters are not supported */
 479                 if (tre_islower(c)) {
 480                         b = e = tre_toupper(c);
 481                         for (c++, e++; c<=max; c++, e++)
 482                                 if (tre_toupper(c) != e) break;
 483                 } else if (tre_isupper(c)) {
 484                         b = e = tre_tolower(c);
 485                         for (c++, e++; c<=max; c++, e++)
 486                                 if (tre_tolower(c) != e) break;
 487                 } else {
 488                         c++;
 489                         continue;
 490                 }
 491                 lit = tre_new_lit(ls);
 492                 if (!lit)
 493                         return -1;
 494                 lit->code_min = b;
 495                 lit->code_max = e-1;
 496                 lit->position = -1;
 497         }
 498         return 0;
 499 }
 500
 501
 502 /* Maximum number of character classes in a negated bracket expression. */
 503 #define MAX_NEG_CLASSES 64
 504
 505 struct neg {
 506         int negate;
 507         int len;
 508         tre_ctype_t a[MAX_NEG_CLASSES];
 509 };
 510
 511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 512
 513 /*
 514 bracket grammar:
 515 Bracket  =  '[' List ']'  |  '[^' List ']'
 516 List     =  Term  |  List Term
 517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
 518 Range    =  Char '-' Char  |  Char '-' '-'
 519 Char     =  Coll  |  coll_single
 520 Meta     =  ']'  |  '-'
 521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
 522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
 523 Chclass  =  '[:' class ':]'
 524
 525 coll_single is a single char collating element but it can be
 526  '-' only at the beginning or end of a List and
 527  ']' only at the beginning of a List and
 528  '^' anywhere except after the openning '['
 529 */
 530
 531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
 532 {
 533         const char *start = s;
 534         tre_ctype_t class;
 535         int min, max;
 536         wchar_t wc;
 537         int len;
 538
 539         for (;;) {
 540                 class = 0;
 541                 len = mbtowc(&wc, s, -1);
 542                 if (len <= 0)
 543                         return *s ? REG_BADPAT : REG_EBRACK;
 544                 if (*s == ']' && s != start) {
 545                         ctx->s = s+1;
 546                         return REG_OK;
 547                 }
 548                 if (*s == '-' && s != start && s[1] != ']' &&
 549                     /* extension: [a-z--@] is accepted as [a-z]|[--@] */
 550                     (s[1] != '-' || s[2] == ']'))
 551                         return REG_ERANGE;
 552                 if (*s == '[' && (s[1] == '.' || s[1] == '='))
 553                         /* collating symbols and equivalence classes are not supported */
 554                         return REG_ECOLLATE;
 555                 if (*s == '[' && s[1] == ':') {
 556                         char tmp[CHARCLASS_NAME_MAX+1];
 557                         s += 2;
 558                         for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
 559                                 if (s[len] == ':') {
 560                                         memcpy(tmp, s, len);
 561                                         tmp[len] = 0;
 562                                         class = tre_ctype(tmp);
 563                                         break;
 564                                 }
 565                         }
 566                         if (!class || s[len+1] != ']')
 567                                 return REG_ECTYPE;
 568                         min = 0;
 569                         max = TRE_CHAR_MAX;
 570                         s += len+2;
 571                 } else {
 572                         min = max = wc;
 573                         s += len;
 574                         if (*s == '-' && s[1] != ']') {
 575                                 s++;
 576                                 len = mbtowc(&wc, s, -1);
 577                                 max = wc;
 578                                 /* XXX - Should use collation order instead of
 579                                    encoding values in character ranges. */
 580                                 if (len <= 0 || min > max)
 581                                         return REG_ERANGE;
 582                                 s += len;
 583                         }
 584                 }
 585
 586                 if (class && neg->negate) {
 587                         if (neg->len >= MAX_NEG_CLASSES)
 588                                 return REG_ESPACE;
 589                         neg->a[neg->len++] = class;
 590                 } else  {
 591                         tre_literal_t *lit = tre_new_lit(ls);
 592                         if (!lit)
 593                                 return REG_ESPACE;
 594                         lit->code_min = min;
 595                         lit->code_max = max;
 596                         lit->class = class;
 597                         lit->position = -1;
 598
 599                         /* Add opposite-case codepoints if REG_ICASE is present.
 600                            It seems that POSIX requires that bracket negation
 601                            should happen before case-folding, but most practical
 602                            implementations do it the other way around. Changing
 603                            the order would need efficient representation of
 604                            case-fold ranges and bracket range sets even with
 605                            simple patterns so this is ok for now. */
 606                         if (ctx->cflags & REG_ICASE && !class)
 607                                 if (add_icase_literals(ls, min, max))
 608                                         return REG_ESPACE;
 609                 }
 610         }
 611 }
 612
 613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 614 {
 615         int i, max, min, negmax, negmin;
 616         tre_ast_node_t *node = 0, *n;
 617         tre_ctype_t *nc = 0;
 618         tre_literal_t *lit;
 619         struct literals ls;
 620         struct neg neg;
 621         reg_errcode_t err;
 622
 623         ls.mem = ctx->mem;
 624         ls.len = 0;
 625         ls.cap = 32;
 626         ls.a = xmalloc(ls.cap * sizeof *ls.a);
 627         if (!ls.a)
 628                 return REG_ESPACE;
 629         neg.len = 0;
 630         neg.negate = *s == '^';
 631         if (neg.negate)
 632                 s++;
 633
 634         err = parse_bracket_terms(ctx, s, &ls, &neg);
 635         if (err != REG_OK)
 636                 goto parse_bracket_done;
 637
 638         if (neg.negate) {
 639                 /* Sort the array if we need to negate it. */
 640                 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
 641                 /* extra lit for the last negated range */
 642                 lit = tre_new_lit(&ls);
 643                 if (!lit) {
 644                         err = REG_ESPACE;
 645                         goto parse_bracket_done;
 646                 }
 647                 lit->code_min = TRE_CHAR_MAX+1;
 648                 lit->code_max = TRE_CHAR_MAX+1;
 649                 lit->position = -1;
 650                 /* negated classes */
 651                 if (neg.len) {
 652                         nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
 653                         if (!nc) {
 654                                 err = REG_ESPACE;
 655                                 goto parse_bracket_done;
 656                         }
 657                         memcpy(nc, neg.a, neg.len*sizeof *neg.a);
 658                         nc[neg.len] = 0;
 659                 }
 660         }
 661
 662         /* Build a union of the items in the array, negated if necessary. */
 663         negmax = negmin = 0;
 664         for (i = 0; i < ls.len; i++) {
 665                 lit = ls.a[i];
 666                 min = lit->code_min;
 667                 max = lit->code_max;
 668                 if (neg.negate) {
 669                         if (min <= negmin) {
 670                                 /* Overlap. */
 671                                 negmin = MAX(max + 1, negmin);
 672                                 continue;
 673                         }
 674                         negmax = min - 1;
 675                         lit->code_min = negmin;
 676                         lit->code_max = negmax;
 677                         negmin = max + 1;
 678                 }
 679                 lit->position = ctx->position;
 680                 lit->neg_classes = nc;
 681                 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
 682                 node = tre_ast_new_union(ctx->mem, node, n);
 683                 if (!node) {
 684                         err = REG_ESPACE;
 685                         break;
 686                 }
 687         }
 688
 689 parse_bracket_done:
 690         xfree(ls.a);
 691         ctx->position++;
 692         ctx->n = node;
 693         return err;
 694 }
 695
 696 static const char *parse_dup_count(const char *s, int *n)
 697 {
 698         *n = -1;
 699         if (!isdigit(*s))
 700                 return s;
 701         *n = 0;
 702         for (;;) {
 703                 *n = 10 * *n + (*s - '0');
 704                 s++;
 705                 if (!isdigit(*s) || *n > RE_DUP_MAX)
 706                         break;
 707         }
 708         return s;
 709 }
 710
 711 static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
 712 {
 713         int min, max;
 714
 715         s = parse_dup_count(s, &min);
 716         if (*s == ',')
 717                 s = parse_dup_count(s+1, &max);
 718         else
 719                 max = min;
 720
 721         if (
 722                 (max < min && max >= 0) ||
 723                 max > RE_DUP_MAX ||
 724                 min > RE_DUP_MAX ||
 725                 min < 0 ||
 726                 (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
 727                 *s++ != '}'
 728         )
 729                 return REG_BADBR;
 730
 731         if (min == 0 && max == 0)
 732                 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 733         else
 734                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
 735         if (!ctx->n)
 736                 return REG_ESPACE;
 737         ctx->s = s;
 738         return REG_OK;
 739 }
 740
 741 static int hexval(unsigned c)
 742 {
 743         if (c-'0'<10) return c-'0';
 744         c |= 32;
 745         if (c-'a'<6) return c-'a'+10;
 746         return -1;
 747 }
 748
 749 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
 750 {
 751         if (node->submatch_id >= 0) {
 752                 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 753                 if (!n)
 754                         return REG_ESPACE;
 755                 n = tre_ast_new_catenation(ctx->mem, n, node);
 756                 if (!n)
 757                         return REG_ESPACE;
 758                 n->num_submatches = node->num_submatches;
 759                 node = n;
 760         }
 761         node->submatch_id = subid;
 762         node->num_submatches++;
 763         ctx->n = node;
 764         return REG_OK;
 765 }
 766
 767 /*
 768 BRE grammar:
 769 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
 770 Branch =  Atom  |  Branch Atom
 771 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
 772 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 773
 774 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 775
 776 ERE grammar:
 777 Regex  =  Branch  |  Regex '|' Branch
 778 Branch =  Atom  |  Branch Atom
 779 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
 780 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 781
 782 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 783 */
 784
 785 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
 786 {
 787         int len, ere = ctx->cflags & REG_EXTENDED;
 788         const char *p;
 789         tre_ast_node_t *node;
 790         wchar_t wc;
 791         switch (*s) {
 792         case '[':
 793                 return parse_bracket(ctx, s+1);
 794         case '\\':
 795                 p = tre_expand_macro(s+1);
 796                 if (p) {
 797                         /* assume \X expansion is a single atom */
 798                         reg_errcode_t err = parse_atom(ctx, p);
 799                         ctx->s = s+2;
 800                         return err;
 801                 }
 802                 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
 803                 switch (*++s) {
 804                 case 0:
 805                         return REG_EESCAPE;
 806                 case 'b':
 807                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
 808                         break;
 809                 case 'B':
 810                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
 811                         break;
 812                 case '<':
 813                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
 814                         break;
 815                 case '>':
 816                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
 817                         break;
 818                 case 'x':
 819                         s++;
 820                         int i, v = 0, c;
 821                         len = 2;
 822                         if (*s == '{') {
 823                                 len = 8;
 824                                 s++;
 825                         }
 826                         for (i=0; i<len && v<0x110000; i++) {
 827                                 c = hexval(s[i]);
 828                                 if (c < 0) break;
 829                                 v = 16*v + c;
 830                         }
 831                         s += i;
 832                         if (len == 8) {
 833                                 if (*s != '}')
 834                                         return REG_EBRACE;
 835                                 s++;
 836                         }
 837                         node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
 838                         s--;
 839                         break;
 840                 case '{':
 841                         /* reject repetitions after empty expression in BRE */
 842                         if (!ere)
 843                                 return REG_BADRPT;
 844                 case '|':
 845                         /* extension: treat \| as alternation in BRE */
 846                         if (!ere) {
 847                                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 848                                 s--;
 849                                 goto end;
 850                         }
 851                         /* fallthrough */
 852                 default:
 853                         if (!ere && (unsigned)*s-'1' < 9) {
 854                                 /* back reference */
 855                                 int val = *s - '0';
 856                                 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
 857                                 ctx->max_backref = MAX(val, ctx->max_backref);
 858                         } else {
 859                                 /* extension: accept unknown escaped char
 860                                    as a literal */
 861                                 goto parse_literal;
 862                         }
 863                 }
 864                 s++;
 865                 break;
 866         case '.':
 867                 if (ctx->cflags & REG_NEWLINE) {
 868                         tre_ast_node_t *tmp1, *tmp2;
 869                         tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
 870                         tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
 871                         if (tmp1 && tmp2)
 872                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 873                         else
 874                                 node = 0;
 875                 } else {
 876                         node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 877                 }
 878                 s++;
 879                 break;
 880         case '^':
 881                 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
 882                 if (!ere && s != ctx->re)
 883                         goto parse_literal;
 884                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
 885                 s++;
 886                 break;
 887         case '$':
 888                 /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
 889                 if (!ere && s[1])
 890                         goto parse_literal;
 891                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
 892                 s++;
 893                 break;
 894         case '*':
 895                 return REG_BADPAT;
 896         case '{':
 897         case '+':
 898         case '?':
 899                 /* reject repetitions after empty expression in ERE */
 900                 if (ere)
 901                         return REG_BADRPT;
 902         case '|':
 903                 if (!ere)
 904                         goto parse_literal;
 905         case 0:
 906                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 907                 break;
 908         default:
 909 parse_literal:
 910                 len = mbtowc(&wc, s, -1);
 911                 if (len < 0)
 912                         return REG_BADPAT;
 913                 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
 914                         tre_ast_node_t *tmp1, *tmp2;
 915                         /* multiple opposite case characters are not supported */
 916                         tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
 917                         tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
 918                         if (tmp1 && tmp2)
 919                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 920                         else
 921                                 node = 0;
 922                 } else {
 923                         node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 924                 }
 925                 ctx->position++;
 926                 s += len;
 927                 break;
 928         }
 929 end:
 930         if (!node)
 931                 return REG_ESPACE;
 932         ctx->n = node;
 933         ctx->s = s;
 934         return REG_OK;
 935 }
 936
 937 #define PUSHPTR(err, s, v) do { \
 938         if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
 939                 return err; \
 940 } while(0)
 941
 942 #define PUSHINT(err, s, v) do { \
 943         if ((err = tre_stack_push_int(s, v)) != REG_OK) \
 944                 return err; \
 945 } while(0)
 946
 947 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
 948 {
 949         tre_ast_node_t *nbranch=0, *nunion=0;
 950         int ere = ctx->cflags & REG_EXTENDED;
 951         const char *s = ctx->re;
 952         int subid = 0;
 953         int depth = 0;
 954         reg_errcode_t err;
 955         tre_stack_t *stack = ctx->stack;
 956
 957         PUSHINT(err, stack, subid++);
 958         for (;;) {
 959                 if ((!ere && *s == '\\' && s[1] == '(') ||
 960                     (ere && *s == '(')) {
 961                         PUSHPTR(err, stack, nunion);
 962                         PUSHPTR(err, stack, nbranch);
 963                         PUSHINT(err, stack, subid++);
 964                         s++;
 965                         if (!ere)
 966                                 s++;
 967                         depth++;
 968                         nbranch = nunion = 0;
 969                         continue;
 970                 }
 971                 if ((!ere && *s == '\\' && s[1] == ')') ||
 972                     (ere && *s == ')' && depth)) {
 973                         ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 974                         if (!ctx->n)
 975                                 return REG_ESPACE;
 976                 } else {
 977                         err = parse_atom(ctx, s);
 978                         if (err != REG_OK)
 979                                 return err;
 980                         s = ctx->s;
 981                 }
 982
 983         parse_iter:
 984                 /* extension: repetitions are rejected after an empty node
 985                    eg. (+), |*, {2}, but assertions are not treated as empty
 986                    so ^* or $? are accepted currently. */
 987                 for (;;) {
 988                         if (*s!='\\' && *s!='*') {
 989                                 if (!ere)
 990                                         break;
 991                                 if (*s!='+' && *s!='?' && *s!='{')
 992                                         break;
 993                         }
 994                         if (*s=='\\' && ere)
 995                                 break;
 996                         if (*s=='\\' && s[1]!='{')
 997                                 break;
 998                         if (*s=='\\')
 999                                 s++;
1000
1001                         /* extension: multiple consecutive *+?{,} is unspecified,
1002                            but (a+)+ has to be supported so accepting a++ makes
1003                            sense, note however that the RE_DUP_MAX limit can be
1004                            circumvented: (a{255}){255} uses a lot of memory.. */
1005                         if (*s=='{') {
1006                                 err = parse_dup(ctx, s+1);
1007                                 if (err != REG_OK)
1008                                         return err;
1009                                 s = ctx->s;
1010                         } else {
1011                                 int min=0, max=-1;
1012                                 if (*s == '+')
1013                                         min = 1;
1014                                 if (*s == '?')
1015                                         max = 1;
1016                                 s++;
1017                                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1018                                 if (!ctx->n)
1019                                         return REG_ESPACE;
1020                         }
1021                 }
1022
1023                 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1024                 if ((ere && *s == '|') ||
1025                     (ere && *s == ')' && depth) ||
1026                     (!ere && *s == '\\' && s[1] == ')') ||
1027                     /* extension: treat \| as alternation in BRE */
1028                     (!ere && *s == '\\' && s[1] == '|') ||
1029                     !*s) {
1030                         /* extension: empty branch is unspecified (), (|a), (a|)
1031                            here they are not rejected but match on empty string */
1032                         int c = *s;
1033                         nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1034                         nbranch = 0;
1035
1036                         if (c == '\\' && s[1] == '|') {
1037                                 s+=2;
1038                         } else if (c == '|') {
1039                                 s++;
1040                         } else {
1041                                 if (c == '\\') {
1042                                         if (!depth) return REG_EPAREN;
1043                                         s+=2;
1044                                 } else if (c == ')')
1045                                         s++;
1046                                 depth--;
1047                                 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1048                                 if (err != REG_OK)
1049                                         return err;
1050                                 if (!c && depth<0) {
1051                                         ctx->submatch_id = subid;
1052                                         return REG_OK;
1053                                 }
1054                                 if (!c || depth<0)
1055                                         return REG_EPAREN;
1056                                 nbranch = tre_stack_pop_voidptr(stack);
1057                                 nunion = tre_stack_pop_voidptr(stack);
1058                                 goto parse_iter;
1059                         }
1060                 }
1061         }
1062 }
1063
1064
1065 /***********************************************************************
1066  from tre-compile.c
1067 ***********************************************************************/
1068
1069
1070 /*
1071   TODO:
1072    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1073      function calls.
1074 */
1075
1076 /*
1077   Algorithms to setup tags so that submatch addressing can be done.
1078 */
1079
1080
1081 /* Inserts a catenation node to the root of the tree given in `node'.
1082    As the left child a new tag with number `tag_id' to `node' is added,
1083    and the right child is the old root. */
1084 static reg_errcode_t
1085 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1086 {
1087   tre_catenation_t *c;
1088
1089   c = tre_mem_alloc(mem, sizeof(*c));
1090   if (c == NULL)
1091     return REG_ESPACE;
1092   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1093   if (c->left == NULL)
1094     return REG_ESPACE;
1095   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1096   if (c->right == NULL)
1097     return REG_ESPACE;
1098
1099   c->right->obj = node->obj;
1100   c->right->type = node->type;
1101   c->right->nullable = -1;
1102   c->right->submatch_id = -1;
1103   c->right->firstpos = NULL;
1104   c->right->lastpos = NULL;
1105   c->right->num_tags = 0;
1106   node->obj = c;
1107   node->type = CATENATION;
1108   return REG_OK;
1109 }
1110
1111 /* Inserts a catenation node to the root of the tree given in `node'.
1112    As the right child a new tag with number `tag_id' to `node' is added,
1113    and the left child is the old root. */
1114 static reg_errcode_t
1115 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1116 {
1117   tre_catenation_t *c;
1118
1119   c = tre_mem_alloc(mem, sizeof(*c));
1120   if (c == NULL)
1121     return REG_ESPACE;
1122   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1123   if (c->right == NULL)
1124     return REG_ESPACE;
1125   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1126   if (c->left == NULL)
1127     return REG_ESPACE;
1128
1129   c->left->obj = node->obj;
1130   c->left->type = node->type;
1131   c->left->nullable = -1;
1132   c->left->submatch_id = -1;
1133   c->left->firstpos = NULL;
1134   c->left->lastpos = NULL;
1135   c->left->num_tags = 0;
1136   node->obj = c;
1137   node->type = CATENATION;
1138   return REG_OK;
1139 }
1140
1141 typedef enum {
1142   ADDTAGS_RECURSE,
1143   ADDTAGS_AFTER_ITERATION,
1144   ADDTAGS_AFTER_UNION_LEFT,
1145   ADDTAGS_AFTER_UNION_RIGHT,
1146   ADDTAGS_AFTER_CAT_LEFT,
1147   ADDTAGS_AFTER_CAT_RIGHT,
1148   ADDTAGS_SET_SUBMATCH_END
1149 } tre_addtags_symbol_t;
1150
1151
1152 typedef struct {
1153   int tag;
1154   int next_tag;
1155 } tre_tag_states_t;
1156
1157
1158 /* Go through `regset' and set submatch data for submatches that are
1159    using this tag. */
1160 static void
1161 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1162 {
1163   int i;
1164
1165   for (i = 0; regset[i] >= 0; i++)
1166     {
1167       int id = regset[i] / 2;
1168       int start = !(regset[i] % 2);
1169       if (start)
1170         tnfa->submatch_data[id].so_tag = tag;
1171       else
1172         tnfa->submatch_data[id].eo_tag = tag;
1173     }
1174   regset[0] = -1;
1175 }
1176
1177
1178 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1179    subexpressions marked for submatch addressing can be traced. */
1180 static reg_errcode_t
1181 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1182              tre_tnfa_t *tnfa)
1183 {
1184   reg_errcode_t status = REG_OK;
1185   tre_addtags_symbol_t symbol;
1186   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1187   int bottom = tre_stack_num_objects(stack);
1188   /* True for first pass (counting number of needed tags) */
1189   int first_pass = (mem == NULL || tnfa == NULL);
1190   int *regset, *orig_regset;
1191   int num_tags = 0; /* Total number of tags. */
1192   int num_minimals = 0;  /* Number of special minimal tags. */
1193   int tag = 0;      /* The tag that is to be added next. */
1194   int next_tag = 1; /* Next tag to use after this one. */
1195   int *parents;     /* Stack of submatches the current submatch is
1196                        contained in. */
1197   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1198   tre_tag_states_t *saved_states;
1199
1200   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1201   if (!first_pass)
1202     {
1203       tnfa->end_tag = 0;
1204       tnfa->minimal_tags[0] = -1;
1205     }
1206
1207   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1208   if (regset == NULL)
1209     return REG_ESPACE;
1210   regset[0] = -1;
1211   orig_regset = regset;
1212
1213   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1214   if (parents == NULL)
1215     {
1216       xfree(regset);
1217       return REG_ESPACE;
1218     }
1219   parents[0] = -1;
1220
1221   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1222   if (saved_states == NULL)
1223     {
1224       xfree(regset);
1225       xfree(parents);
1226       return REG_ESPACE;
1227     }
1228   else
1229     {
1230       unsigned int i;
1231       for (i = 0; i <= tnfa->num_submatches; i++)
1232         saved_states[i].tag = -1;
1233     }
1234
1235   STACK_PUSH(stack, voidptr, node);
1236   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1237
1238   while (tre_stack_num_objects(stack) > bottom)
1239     {
1240       if (status != REG_OK)
1241         break;
1242
1243       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1244       switch (symbol)
1245         {
1246
1247         case ADDTAGS_SET_SUBMATCH_END:
1248           {
1249             int id = tre_stack_pop_int(stack);
1250             int i;
1251
1252             /* Add end of this submatch to regset. */
1253             for (i = 0; regset[i] >= 0; i++);
1254             regset[i] = id * 2 + 1;
1255             regset[i + 1] = -1;
1256
1257             /* Pop this submatch from the parents stack. */
1258             for (i = 0; parents[i] >= 0; i++);
1259             parents[i - 1] = -1;
1260             break;
1261           }
1262
1263         case ADDTAGS_RECURSE:
1264           node = tre_stack_pop_voidptr(stack);
1265
1266           if (node->submatch_id >= 0)
1267             {
1268               int id = node->submatch_id;
1269               int i;
1270
1271
1272               /* Add start of this submatch to regset. */
1273               for (i = 0; regset[i] >= 0; i++);
1274               regset[i] = id * 2;
1275               regset[i + 1] = -1;
1276
1277               if (!first_pass)
1278                 {
1279                   for (i = 0; parents[i] >= 0; i++);
1280                   tnfa->submatch_data[id].parents = NULL;
1281                   if (i > 0)
1282                     {
1283                       int *p = xmalloc(sizeof(*p) * (i + 1));
1284                       if (p == NULL)
1285                         {
1286                           status = REG_ESPACE;
1287                           break;
1288                         }
1289                       assert(tnfa->submatch_data[id].parents == NULL);
1290                       tnfa->submatch_data[id].parents = p;
1291                       for (i = 0; parents[i] >= 0; i++)
1292                         p[i] = parents[i];
1293                       p[i] = -1;
1294                     }
1295                 }
1296
1297               /* Add end of this submatch to regset after processing this
1298                  node. */
1299               STACK_PUSHX(stack, int, node->submatch_id);
1300               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1301             }
1302
1303           switch (node->type)
1304             {
1305             case LITERAL:
1306               {
1307                 tre_literal_t *lit = node->obj;
1308
1309                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1310                   {
1311                     int i;
1312                     if (regset[0] >= 0)
1313                       {
1314                         /* Regset is not empty, so add a tag before the
1315                            literal or backref. */
1316                         if (!first_pass)
1317                           {
1318                             status = tre_add_tag_left(mem, node, tag);
1319                             tnfa->tag_directions[tag] = direction;
1320                             if (minimal_tag >= 0)
1321                               {
1322                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1323                                 tnfa->minimal_tags[i] = tag;
1324                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1325                                 tnfa->minimal_tags[i + 2] = -1;
1326                                 minimal_tag = -1;
1327                                 num_minimals++;
1328                               }
1329                             tre_purge_regset(regset, tnfa, tag);
1330                           }
1331                         else
1332                           {
1333                             node->num_tags = 1;
1334                           }
1335
1336                         regset[0] = -1;
1337                         tag = next_tag;
1338                         num_tags++;
1339                         next_tag++;
1340                       }
1341                   }
1342                 else
1343                   {
1344                     assert(!IS_TAG(lit));
1345                   }
1346                 break;
1347               }
1348             case CATENATION:
1349               {
1350                 tre_catenation_t *cat = node->obj;
1351                 tre_ast_node_t *left = cat->left;
1352                 tre_ast_node_t *right = cat->right;
1353                 int reserved_tag = -1;
1354
1355
1356                 /* After processing right child. */
1357                 STACK_PUSHX(stack, voidptr, node);
1358                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1359
1360                 /* Process right child. */
1361                 STACK_PUSHX(stack, voidptr, right);
1362                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1363
1364                 /* After processing left child. */
1365                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1366                 if (left->num_tags > 0 && right->num_tags > 0)
1367                   {
1368                     /* Reserve the next tag to the right child. */
1369                     reserved_tag = next_tag;
1370                     next_tag++;
1371                   }
1372                 STACK_PUSHX(stack, int, reserved_tag);
1373                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1374
1375                 /* Process left child. */
1376                 STACK_PUSHX(stack, voidptr, left);
1377                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1378
1379                 }
1380               break;
1381             case ITERATION:
1382               {
1383                 tre_iteration_t *iter = node->obj;
1384
1385                 if (first_pass)
1386                   {
1387                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1388                   }
1389                 else
1390                   {
1391                     STACK_PUSHX(stack, int, tag);
1392                     STACK_PUSHX(stack, int, iter->minimal);
1393                   }
1394                 STACK_PUSHX(stack, voidptr, node);
1395                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1396
1397                 STACK_PUSHX(stack, voidptr, iter->arg);
1398                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1399
1400                 /* Regset is not empty, so add a tag here. */
1401                 if (regset[0] >= 0 || iter->minimal)
1402                   {
1403                     if (!first_pass)
1404                       {
1405                         int i;
1406                         status = tre_add_tag_left(mem, node, tag);
1407                         if (iter->minimal)
1408                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1409                         else
1410                           tnfa->tag_directions[tag] = direction;
1411                         if (minimal_tag >= 0)
1412                           {
1413                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1414                             tnfa->minimal_tags[i] = tag;
1415                             tnfa->minimal_tags[i + 1] = minimal_tag;
1416                             tnfa->minimal_tags[i + 2] = -1;
1417                             minimal_tag = -1;
1418                             num_minimals++;
1419                           }
1420                         tre_purge_regset(regset, tnfa, tag);
1421                       }
1422
1423                     regset[0] = -1;
1424                     tag = next_tag;
1425                     num_tags++;
1426                     next_tag++;
1427                   }
1428                 direction = TRE_TAG_MINIMIZE;
1429               }
1430               break;
1431             case UNION:
1432               {
1433                 tre_union_t *uni = node->obj;
1434                 tre_ast_node_t *left = uni->left;
1435                 tre_ast_node_t *right = uni->right;
1436                 int left_tag;
1437                 int right_tag;
1438
1439                 if (regset[0] >= 0)
1440                   {
1441                     left_tag = next_tag;
1442                     right_tag = next_tag + 1;
1443                   }
1444                 else
1445                   {
1446                     left_tag = tag;
1447                     right_tag = next_tag;
1448                   }
1449
1450                 /* After processing right child. */
1451                 STACK_PUSHX(stack, int, right_tag);
1452                 STACK_PUSHX(stack, int, left_tag);
1453                 STACK_PUSHX(stack, voidptr, regset);
1454                 STACK_PUSHX(stack, int, regset[0] >= 0);
1455                 STACK_PUSHX(stack, voidptr, node);
1456                 STACK_PUSHX(stack, voidptr, right);
1457                 STACK_PUSHX(stack, voidptr, left);
1458                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1459
1460                 /* Process right child. */
1461                 STACK_PUSHX(stack, voidptr, right);
1462                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1463
1464                 /* After processing left child. */
1465                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1466
1467                 /* Process left child. */
1468                 STACK_PUSHX(stack, voidptr, left);
1469                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1470
1471                 /* Regset is not empty, so add a tag here. */
1472                 if (regset[0] >= 0)
1473                   {
1474                     if (!first_pass)
1475                       {
1476                         int i;
1477                         status = tre_add_tag_left(mem, node, tag);
1478                         tnfa->tag_directions[tag] = direction;
1479                         if (minimal_tag >= 0)
1480                           {
1481                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1482                             tnfa->minimal_tags[i] = tag;
1483                             tnfa->minimal_tags[i + 1] = minimal_tag;
1484                             tnfa->minimal_tags[i + 2] = -1;
1485                             minimal_tag = -1;
1486                             num_minimals++;
1487                           }
1488                         tre_purge_regset(regset, tnfa, tag);
1489                       }
1490
1491                     regset[0] = -1;
1492                     tag = next_tag;
1493                     num_tags++;
1494                     next_tag++;
1495                   }
1496
1497                 if (node->num_submatches > 0)
1498                   {
1499                     /* The next two tags are reserved for markers. */
1500                     next_tag++;
1501                     tag = next_tag;
1502                     next_tag++;
1503                   }
1504
1505                 break;
1506               }
1507             }
1508
1509           if (node->submatch_id >= 0)
1510             {
1511               int i;
1512               /* Push this submatch on the parents stack. */
1513               for (i = 0; parents[i] >= 0; i++);
1514               parents[i] = node->submatch_id;
1515               parents[i + 1] = -1;
1516             }
1517
1518           break; /* end case: ADDTAGS_RECURSE */
1519
1520         case ADDTAGS_AFTER_ITERATION:
1521           {
1522             int minimal = 0;
1523             int enter_tag;
1524             node = tre_stack_pop_voidptr(stack);
1525             if (first_pass)
1526               {
1527                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1528                   + tre_stack_pop_int(stack);
1529                 minimal_tag = -1;
1530               }
1531             else
1532               {
1533                 minimal = tre_stack_pop_int(stack);
1534                 enter_tag = tre_stack_pop_int(stack);
1535                 if (minimal)
1536                   minimal_tag = enter_tag;
1537               }
1538
1539             if (!first_pass)
1540               {
1541                 if (minimal)
1542                   direction = TRE_TAG_MINIMIZE;
1543                 else
1544                   direction = TRE_TAG_MAXIMIZE;
1545               }
1546             break;
1547           }
1548
1549         case ADDTAGS_AFTER_CAT_LEFT:
1550           {
1551             int new_tag = tre_stack_pop_int(stack);
1552             next_tag = tre_stack_pop_int(stack);
1553             if (new_tag >= 0)
1554               {
1555                 tag = new_tag;
1556               }
1557             break;
1558           }
1559
1560         case ADDTAGS_AFTER_CAT_RIGHT:
1561           node = tre_stack_pop_voidptr(stack);
1562           if (first_pass)
1563             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1564               + ((tre_catenation_t *)node->obj)->right->num_tags;
1565           break;
1566
1567         case ADDTAGS_AFTER_UNION_LEFT:
1568           /* Lift the bottom of the `regset' array so that when processing
1569              the right operand the items currently in the array are
1570              invisible.  The original bottom was saved at ADDTAGS_UNION and
1571              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1572           while (*regset >= 0)
1573             regset++;
1574           break;
1575
1576         case ADDTAGS_AFTER_UNION_RIGHT:
1577           {
1578             int added_tags, tag_left, tag_right;
1579             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1580             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1581             node = tre_stack_pop_voidptr(stack);
1582             added_tags = tre_stack_pop_int(stack);
1583             if (first_pass)
1584               {
1585                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1586                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1587                   + ((node->num_submatches > 0) ? 2 : 0);
1588               }
1589             regset = tre_stack_pop_voidptr(stack);
1590             tag_left = tre_stack_pop_int(stack);
1591             tag_right = tre_stack_pop_int(stack);
1592
1593             /* Add tags after both children, the left child gets a smaller
1594                tag than the right child.  This guarantees that we prefer
1595                the left child over the right child. */
1596             /* XXX - This is not always necessary (if the children have
1597                tags which must be seen for every match of that child). */
1598             /* XXX - Check if this is the only place where tre_add_tag_right
1599                is used.  If so, use tre_add_tag_left (putting the tag before
1600                the child as opposed after the child) and throw away
1601                tre_add_tag_right. */
1602             if (node->num_submatches > 0)
1603               {
1604                 if (!first_pass)
1605                   {
1606                     status = tre_add_tag_right(mem, left, tag_left);
1607                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1608                     if (status == REG_OK)
1609                       status = tre_add_tag_right(mem, right, tag_right);
1610                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1611                   }
1612                 num_tags += 2;
1613               }
1614             direction = TRE_TAG_MAXIMIZE;
1615             break;
1616           }
1617
1618         default:
1619           assert(0);
1620           break;
1621
1622         } /* end switch(symbol) */
1623     } /* end while(tre_stack_num_objects(stack) > bottom) */
1624
1625   if (!first_pass)
1626     tre_purge_regset(regset, tnfa, tag);
1627
1628   if (!first_pass && minimal_tag >= 0)
1629     {
1630       int i;
1631       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1632       tnfa->minimal_tags[i] = tag;
1633       tnfa->minimal_tags[i + 1] = minimal_tag;
1634       tnfa->minimal_tags[i + 2] = -1;
1635       minimal_tag = -1;
1636       num_minimals++;
1637     }
1638
1639   assert(tree->num_tags == num_tags);
1640   tnfa->end_tag = num_tags;
1641   tnfa->num_tags = num_tags;
1642   tnfa->num_minimals = num_minimals;
1643   xfree(orig_regset);
1644   xfree(parents);
1645   xfree(saved_states);
1646   return status;
1647 }
1648
1649
1650
1651 /*
1652   AST to TNFA compilation routines.
1653 */
1654
1655 typedef enum {
1656   COPY_RECURSE,
1657   COPY_SET_RESULT_PTR
1658 } tre_copyast_symbol_t;
1659
1660 /* Flags for tre_copy_ast(). */
1661 #define COPY_REMOVE_TAGS         1
1662 #define COPY_MAXIMIZE_FIRST_TAG  2
1663
1664 static reg_errcode_t
1665 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1666              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1667              tre_ast_node_t **copy, int *max_pos)
1668 {
1669   reg_errcode_t status = REG_OK;
1670   int bottom = tre_stack_num_objects(stack);
1671   int num_copied = 0;
1672   int first_tag = 1;
1673   tre_ast_node_t **result = copy;
1674   tre_copyast_symbol_t symbol;
1675
1676   STACK_PUSH(stack, voidptr, ast);
1677   STACK_PUSH(stack, int, COPY_RECURSE);
1678
1679   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1680     {
1681       tre_ast_node_t *node;
1682       if (status != REG_OK)
1683         break;
1684
1685       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1686       switch (symbol)
1687         {
1688         case COPY_SET_RESULT_PTR:
1689           result = tre_stack_pop_voidptr(stack);
1690           break;
1691         case COPY_RECURSE:
1692           node = tre_stack_pop_voidptr(stack);
1693           switch (node->type)
1694             {
1695             case LITERAL:
1696               {
1697                 tre_literal_t *lit = node->obj;
1698                 int pos = lit->position;
1699                 int min = lit->code_min;
1700                 int max = lit->code_max;
1701                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1702                   {
1703                     /* XXX - e.g. [ab] has only one position but two
1704                        nodes, so we are creating holes in the state space
1705                        here.  Not fatal, just wastes memory. */
1706                     pos += *pos_add;
1707                     num_copied++;
1708                   }
1709                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1710                   {
1711                     /* Change this tag to empty. */
1712                     min = EMPTY;
1713                     max = pos = -1;
1714                   }
1715                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1716                          && first_tag)
1717                   {
1718                     /* Maximize the first tag. */
1719                     tag_directions[max] = TRE_TAG_MAXIMIZE;
1720                     first_tag = 0;
1721                   }
1722                 *result = tre_ast_new_literal(mem, min, max, pos);
1723                 if (*result == NULL)
1724                   status = REG_ESPACE;
1725                 else {
1726                   tre_literal_t *p = (*result)->obj;
1727                   p->class = lit->class;
1728                   p->neg_classes = lit->neg_classes;
1729                 }
1730
1731                 if (pos > *max_pos)
1732                   *max_pos = pos;
1733                 break;
1734               }
1735             case UNION:
1736               {
1737                 tre_union_t *uni = node->obj;
1738                 tre_union_t *tmp;
1739                 *result = tre_ast_new_union(mem, uni->left, uni->right);
1740                 if (*result == NULL)
1741                   {
1742                     status = REG_ESPACE;
1743                     break;
1744                   }
1745                 tmp = (*result)->obj;
1746                 result = &tmp->left;
1747                 STACK_PUSHX(stack, voidptr, uni->right);
1748                 STACK_PUSHX(stack, int, COPY_RECURSE);
1749                 STACK_PUSHX(stack, voidptr, &tmp->right);
1750                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1751                 STACK_PUSHX(stack, voidptr, uni->left);
1752                 STACK_PUSHX(stack, int, COPY_RECURSE);
1753                 break;
1754               }
1755             case CATENATION:
1756               {
1757                 tre_catenation_t *cat = node->obj;
1758                 tre_catenation_t *tmp;
1759                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1760                 if (*result == NULL)
1761                   {
1762                     status = REG_ESPACE;
1763                     break;
1764                   }
1765                 tmp = (*result)->obj;
1766                 tmp->left = NULL;
1767                 tmp->right = NULL;
1768                 result = &tmp->left;
1769
1770                 STACK_PUSHX(stack, voidptr, cat->right);
1771                 STACK_PUSHX(stack, int, COPY_RECURSE);
1772                 STACK_PUSHX(stack, voidptr, &tmp->right);
1773                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1774                 STACK_PUSHX(stack, voidptr, cat->left);
1775                 STACK_PUSHX(stack, int, COPY_RECURSE);
1776                 break;
1777               }
1778             case ITERATION:
1779               {
1780                 tre_iteration_t *iter = node->obj;
1781                 STACK_PUSHX(stack, voidptr, iter->arg);
1782                 STACK_PUSHX(stack, int, COPY_RECURSE);
1783                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1784                                            iter->max, iter->minimal);
1785                 if (*result == NULL)
1786                   {
1787                     status = REG_ESPACE;
1788                     break;
1789                   }
1790                 iter = (*result)->obj;
1791                 result = &iter->arg;
1792                 break;
1793               }
1794             default:
1795               assert(0);
1796               break;
1797             }
1798           break;
1799         }
1800     }
1801   *pos_add += num_copied;
1802   return status;
1803 }
1804
1805 typedef enum {
1806   EXPAND_RECURSE,
1807   EXPAND_AFTER_ITER
1808 } tre_expand_ast_symbol_t;
1809
1810 /* Expands each iteration node that has a finite nonzero minimum or maximum
1811    iteration count to a catenated sequence of copies of the node. */
1812 static reg_errcode_t
1813 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1814                int *position, tre_tag_direction_t *tag_directions)
1815 {
1816   reg_errcode_t status = REG_OK;
1817   int bottom = tre_stack_num_objects(stack);
1818   int pos_add = 0;
1819   int pos_add_total = 0;
1820   int max_pos = 0;
1821   int iter_depth = 0;
1822
1823   STACK_PUSHR(stack, voidptr, ast);
1824   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1825   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1826     {
1827       tre_ast_node_t *node;
1828       tre_expand_ast_symbol_t symbol;
1829
1830       if (status != REG_OK)
1831         break;
1832
1833       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1834       node = tre_stack_pop_voidptr(stack);
1835       switch (symbol)
1836         {
1837         case EXPAND_RECURSE:
1838           switch (node->type)
1839             {
1840             case LITERAL:
1841               {
1842                 tre_literal_t *lit= node->obj;
1843                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1844                   {
1845                     lit->position += pos_add;
1846                     if (lit->position > max_pos)
1847                       max_pos = lit->position;
1848                   }
1849                 break;
1850               }
1851             case UNION:
1852               {
1853                 tre_union_t *uni = node->obj;
1854                 STACK_PUSHX(stack, voidptr, uni->right);
1855                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1856                 STACK_PUSHX(stack, voidptr, uni->left);
1857                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1858                 break;
1859               }
1860             case CATENATION:
1861               {
1862                 tre_catenation_t *cat = node->obj;
1863                 STACK_PUSHX(stack, voidptr, cat->right);
1864                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1865                 STACK_PUSHX(stack, voidptr, cat->left);
1866                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1867                 break;
1868               }
1869             case ITERATION:
1870               {
1871                 tre_iteration_t *iter = node->obj;
1872                 STACK_PUSHX(stack, int, pos_add);
1873                 STACK_PUSHX(stack, voidptr, node);
1874                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1875                 STACK_PUSHX(stack, voidptr, iter->arg);
1876                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1877                 /* If we are going to expand this node at EXPAND_AFTER_ITER
1878                    then don't increase the `pos' fields of the nodes now, it
1879                    will get done when expanding. */
1880                 if (iter->min > 1 || iter->max > 1)
1881                   pos_add = 0;
1882                 iter_depth++;
1883                 break;
1884               }
1885             default:
1886               assert(0);
1887               break;
1888             }
1889           break;
1890         case EXPAND_AFTER_ITER:
1891           {
1892             tre_iteration_t *iter = node->obj;
1893             int pos_add_last;
1894             pos_add = tre_stack_pop_int(stack);
1895             pos_add_last = pos_add;
1896             if (iter->min > 1 || iter->max > 1)
1897               {
1898                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1899                 int j;
1900                 int pos_add_save = pos_add;
1901
1902                 /* Create a catenated sequence of copies of the node. */
1903                 for (j = 0; j < iter->min; j++)
1904                   {
1905                     tre_ast_node_t *copy;
1906                     /* Remove tags from all but the last copy. */
1907                     int flags = ((j + 1 < iter->min)
1908                                  ? COPY_REMOVE_TAGS
1909                                  : COPY_MAXIMIZE_FIRST_TAG);
1910                     pos_add_save = pos_add;
1911                     status = tre_copy_ast(mem, stack, iter->arg, flags,
1912                                           &pos_add, tag_directions, &copy,
1913                                           &max_pos);
1914                     if (status != REG_OK)
1915                       return status;
1916                     if (seq1 != NULL)
1917                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
1918                     else
1919                       seq1 = copy;
1920                     if (seq1 == NULL)
1921                       return REG_ESPACE;
1922                   }
1923
1924                 if (iter->max == -1)
1925                   {
1926                     /* No upper limit. */
1927                     pos_add_save = pos_add;
1928                     status = tre_copy_ast(mem, stack, iter->arg, 0,
1929                                           &pos_add, NULL, &seq2, &max_pos);
1930                     if (status != REG_OK)
1931                       return status;
1932                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1933                     if (seq2 == NULL)
1934                       return REG_ESPACE;
1935                   }
1936                 else
1937                   {
1938                     for (j = iter->min; j < iter->max; j++)
1939                       {
1940                         tre_ast_node_t *tmp, *copy;
1941                         pos_add_save = pos_add;
1942                         status = tre_copy_ast(mem, stack, iter->arg, 0,
1943                                               &pos_add, NULL, &copy, &max_pos);
1944                         if (status != REG_OK)
1945                           return status;
1946                         if (seq2 != NULL)
1947                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
1948                         else
1949                           seq2 = copy;
1950                         if (seq2 == NULL)
1951                           return REG_ESPACE;
1952                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1953                         if (tmp == NULL)
1954                           return REG_ESPACE;
1955                         seq2 = tre_ast_new_union(mem, tmp, seq2);
1956                         if (seq2 == NULL)
1957                           return REG_ESPACE;
1958                       }
1959                   }
1960
1961                 pos_add = pos_add_save;
1962                 if (seq1 == NULL)
1963                   seq1 = seq2;
1964                 else if (seq2 != NULL)
1965                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1966                 if (seq1 == NULL)
1967                   return REG_ESPACE;
1968                 node->obj = seq1->obj;
1969                 node->type = seq1->type;
1970               }
1971
1972             iter_depth--;
1973             pos_add_total += pos_add - pos_add_last;
1974             if (iter_depth == 0)
1975               pos_add = pos_add_total;
1976
1977             break;
1978           }
1979         default:
1980           assert(0);
1981           break;
1982         }
1983     }
1984
1985   *position += pos_add_total;
1986
1987   /* `max_pos' should never be larger than `*position' if the above
1988      code works, but just an extra safeguard let's make sure
1989      `*position' is set large enough so enough memory will be
1990      allocated for the transition table. */
1991   if (max_pos > *position)
1992     *position = max_pos;
1993
1994   return status;
1995 }
1996
1997 static tre_pos_and_tags_t *
1998 tre_set_empty(tre_mem_t mem)
1999 {
2000   tre_pos_and_tags_t *new_set;
2001
2002   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2003   if (new_set == NULL)
2004     return NULL;
2005
2006   new_set[0].position = -1;
2007   new_set[0].code_min = -1;
2008   new_set[0].code_max = -1;
2009
2010   return new_set;
2011 }
2012
2013 static tre_pos_and_tags_t *
2014 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2015             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2016 {
2017   tre_pos_and_tags_t *new_set;
2018
2019   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2020   if (new_set == NULL)
2021     return NULL;
2022
2023   new_set[0].position = position;
2024   new_set[0].code_min = code_min;
2025   new_set[0].code_max = code_max;
2026   new_set[0].class = class;
2027   new_set[0].neg_classes = neg_classes;
2028   new_set[0].backref = backref;
2029   new_set[1].position = -1;
2030   new_set[1].code_min = -1;
2031   new_set[1].code_max = -1;
2032
2033   return new_set;
2034 }
2035
2036 static tre_pos_and_tags_t *
2037 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2038               int *tags, int assertions)
2039 {
2040   int s1, s2, i, j;
2041   tre_pos_and_tags_t *new_set;
2042   int *new_tags;
2043   int num_tags;
2044
2045   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2046   for (s1 = 0; set1[s1].position >= 0; s1++);
2047   for (s2 = 0; set2[s2].position >= 0; s2++);
2048   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2049   if (!new_set )
2050     return NULL;
2051
2052   for (s1 = 0; set1[s1].position >= 0; s1++)
2053     {
2054       new_set[s1].position = set1[s1].position;
2055       new_set[s1].code_min = set1[s1].code_min;
2056       new_set[s1].code_max = set1[s1].code_max;
2057       new_set[s1].assertions = set1[s1].assertions | assertions;
2058       new_set[s1].class = set1[s1].class;
2059       new_set[s1].neg_classes = set1[s1].neg_classes;
2060       new_set[s1].backref = set1[s1].backref;
2061       if (set1[s1].tags == NULL && tags == NULL)
2062         new_set[s1].tags = NULL;
2063       else
2064         {
2065           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2066           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2067                                          * (i + num_tags + 1)));
2068           if (new_tags == NULL)
2069             return NULL;
2070           for (j = 0; j < i; j++)
2071             new_tags[j] = set1[s1].tags[j];
2072           for (i = 0; i < num_tags; i++)
2073             new_tags[j + i] = tags[i];
2074           new_tags[j + i] = -1;
2075           new_set[s1].tags = new_tags;
2076         }
2077     }
2078
2079   for (s2 = 0; set2[s2].position >= 0; s2++)
2080     {
2081       new_set[s1 + s2].position = set2[s2].position;
2082       new_set[s1 + s2].code_min = set2[s2].code_min;
2083       new_set[s1 + s2].code_max = set2[s2].code_max;
2084       /* XXX - why not | assertions here as well? */
2085       new_set[s1 + s2].assertions = set2[s2].assertions;
2086       new_set[s1 + s2].class = set2[s2].class;
2087       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2088       new_set[s1 + s2].backref = set2[s2].backref;
2089       if (set2[s2].tags == NULL)
2090         new_set[s1 + s2].tags = NULL;
2091       else
2092         {
2093           for (i = 0; set2[s2].tags[i] >= 0; i++);
2094           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2095           if (new_tags == NULL)
2096             return NULL;
2097           for (j = 0; j < i; j++)
2098             new_tags[j] = set2[s2].tags[j];
2099           new_tags[j] = -1;
2100           new_set[s1 + s2].tags = new_tags;
2101         }
2102     }
2103   new_set[s1 + s2].position = -1;
2104   return new_set;
2105 }
2106
2107 /* Finds the empty path through `node' which is the one that should be
2108    taken according to POSIX.2 rules, and adds the tags on that path to
2109    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2110    set to the number of tags seen on the path. */
2111 static reg_errcode_t
2112 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2113                 int *assertions, int *num_tags_seen)
2114 {
2115   tre_literal_t *lit;
2116   tre_union_t *uni;
2117   tre_catenation_t *cat;
2118   tre_iteration_t *iter;
2119   int i;
2120   int bottom = tre_stack_num_objects(stack);
2121   reg_errcode_t status = REG_OK;
2122   if (num_tags_seen)
2123     *num_tags_seen = 0;
2124
2125   status = tre_stack_push_voidptr(stack, node);
2126
2127   /* Walk through the tree recursively. */
2128   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2129     {
2130       node = tre_stack_pop_voidptr(stack);
2131
2132       switch (node->type)
2133         {
2134         case LITERAL:
2135           lit = (tre_literal_t *)node->obj;
2136           switch (lit->code_min)
2137             {
2138             case TAG:
2139               if (lit->code_max >= 0)
2140                 {
2141                   if (tags != NULL)
2142                     {
2143                       /* Add the tag to `tags'. */
2144                       for (i = 0; tags[i] >= 0; i++)
2145                         if (tags[i] == lit->code_max)
2146                           break;
2147                       if (tags[i] < 0)
2148                         {
2149                           tags[i] = lit->code_max;
2150                           tags[i + 1] = -1;
2151                         }
2152                     }
2153                   if (num_tags_seen)
2154                     (*num_tags_seen)++;
2155                 }
2156               break;
2157             case ASSERTION:
2158               assert(lit->code_max >= 1
2159                      || lit->code_max <= ASSERT_LAST);
2160               if (assertions != NULL)
2161                 *assertions |= lit->code_max;
2162               break;
2163             case EMPTY:
2164               break;
2165             default:
2166               assert(0);
2167               break;
2168             }
2169           break;
2170
2171         case UNION:
2172           /* Subexpressions starting earlier take priority over ones
2173              starting later, so we prefer the left subexpression over the
2174              right subexpression. */
2175           uni = (tre_union_t *)node->obj;
2176           if (uni->left->nullable)
2177             STACK_PUSHX(stack, voidptr, uni->left)
2178           else if (uni->right->nullable)
2179             STACK_PUSHX(stack, voidptr, uni->right)
2180           else
2181             assert(0);
2182           break;
2183
2184         case CATENATION:
2185           /* The path must go through both children. */
2186           cat = (tre_catenation_t *)node->obj;
2187           assert(cat->left->nullable);
2188           assert(cat->right->nullable);
2189           STACK_PUSHX(stack, voidptr, cat->left);
2190           STACK_PUSHX(stack, voidptr, cat->right);
2191           break;
2192
2193         case ITERATION:
2194           /* A match with an empty string is preferred over no match at
2195              all, so we go through the argument if possible. */
2196           iter = (tre_iteration_t *)node->obj;
2197           if (iter->arg->nullable)
2198             STACK_PUSHX(stack, voidptr, iter->arg);
2199           break;
2200
2201         default:
2202           assert(0);
2203           break;
2204         }
2205     }
2206
2207   return status;
2208 }
2209
2210
2211 typedef enum {
2212   NFL_RECURSE,
2213   NFL_POST_UNION,
2214   NFL_POST_CATENATION,
2215   NFL_POST_ITERATION
2216 } tre_nfl_stack_symbol_t;
2217
2218
2219 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2220    the nodes of the AST `tree'. */
2221 static reg_errcode_t
2222 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2223 {
2224   int bottom = tre_stack_num_objects(stack);
2225
2226   STACK_PUSHR(stack, voidptr, tree);
2227   STACK_PUSHR(stack, int, NFL_RECURSE);
2228
2229   while (tre_stack_num_objects(stack) > bottom)
2230     {
2231       tre_nfl_stack_symbol_t symbol;
2232       tre_ast_node_t *node;
2233
2234       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2235       node = tre_stack_pop_voidptr(stack);
2236       switch (symbol)
2237         {
2238         case NFL_RECURSE:
2239           switch (node->type)
2240             {
2241             case LITERAL:
2242               {
2243                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2244                 if (IS_BACKREF(lit))
2245                   {
2246                     /* Back references: nullable = false, firstpos = {i},
2247                        lastpos = {i}. */
2248                     node->nullable = 0;
2249                     node->firstpos = tre_set_one(mem, lit->position, 0,
2250                                              TRE_CHAR_MAX, 0, NULL, -1);
2251                     if (!node->firstpos)
2252                       return REG_ESPACE;
2253                     node->lastpos = tre_set_one(mem, lit->position, 0,
2254                                                 TRE_CHAR_MAX, 0, NULL,
2255                                                 (int)lit->code_max);
2256                     if (!node->lastpos)
2257                       return REG_ESPACE;
2258                   }
2259                 else if (lit->code_min < 0)
2260                   {
2261                     /* Tags, empty strings, params, and zero width assertions:
2262                        nullable = true, firstpos = {}, and lastpos = {}. */
2263                     node->nullable = 1;
2264                     node->firstpos = tre_set_empty(mem);
2265                     if (!node->firstpos)
2266                       return REG_ESPACE;
2267                     node->lastpos = tre_set_empty(mem);
2268                     if (!node->lastpos)
2269                       return REG_ESPACE;
2270                   }
2271                 else
2272                   {
2273                     /* Literal at position i: nullable = false, firstpos = {i},
2274                        lastpos = {i}. */
2275                     node->nullable = 0;
2276                     node->firstpos =
2277                       tre_set_one(mem, lit->position, (int)lit->code_min,
2278                                   (int)lit->code_max, 0, NULL, -1);
2279                     if (!node->firstpos)
2280                       return REG_ESPACE;
2281                     node->lastpos = tre_set_one(mem, lit->position,
2282                                                 (int)lit->code_min,
2283                                                 (int)lit->code_max,
2284                                                 lit->class, lit->neg_classes,
2285                                                 -1);
2286                     if (!node->lastpos)
2287                       return REG_ESPACE;
2288                   }
2289                 break;
2290               }
2291
2292             case UNION:
2293               /* Compute the attributes for the two subtrees, and after that
2294                  for this node. */
2295               STACK_PUSHR(stack, voidptr, node);
2296               STACK_PUSHR(stack, int, NFL_POST_UNION);
2297               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2298               STACK_PUSHR(stack, int, NFL_RECURSE);
2299               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2300               STACK_PUSHR(stack, int, NFL_RECURSE);
2301               break;
2302
2303             case CATENATION:
2304               /* Compute the attributes for the two subtrees, and after that
2305                  for this node. */
2306               STACK_PUSHR(stack, voidptr, node);
2307               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2308               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2309               STACK_PUSHR(stack, int, NFL_RECURSE);
2310               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2311               STACK_PUSHR(stack, int, NFL_RECURSE);
2312               break;
2313
2314             case ITERATION:
2315               /* Compute the attributes for the subtree, and after that for
2316                  this node. */
2317               STACK_PUSHR(stack, voidptr, node);
2318               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2319               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2320               STACK_PUSHR(stack, int, NFL_RECURSE);
2321               break;
2322             }
2323           break; /* end case: NFL_RECURSE */
2324
2325         case NFL_POST_UNION:
2326           {
2327             tre_union_t *uni = (tre_union_t *)node->obj;
2328             node->nullable = uni->left->nullable || uni->right->nullable;
2329             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2330                                            uni->right->firstpos, NULL, 0);
2331             if (!node->firstpos)
2332               return REG_ESPACE;
2333             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2334                                           uni->right->lastpos, NULL, 0);
2335             if (!node->lastpos)
2336               return REG_ESPACE;
2337             break;
2338           }
2339
2340         case NFL_POST_ITERATION:
2341           {
2342             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2343
2344             if (iter->min == 0 || iter->arg->nullable)
2345               node->nullable = 1;
2346             else
2347               node->nullable = 0;
2348             node->firstpos = iter->arg->firstpos;
2349             node->lastpos = iter->arg->lastpos;
2350             break;
2351           }
2352
2353         case NFL_POST_CATENATION:
2354           {
2355             int num_tags, *tags, assertions;
2356             reg_errcode_t status;
2357             tre_catenation_t *cat = node->obj;
2358             node->nullable = cat->left->nullable && cat->right->nullable;
2359
2360             /* Compute firstpos. */
2361             if (cat->left->nullable)
2362               {
2363                 /* The left side matches the empty string.  Make a first pass
2364                    with tre_match_empty() to get the number of tags and
2365                    parameters. */
2366                 status = tre_match_empty(stack, cat->left,
2367                                          NULL, NULL, &num_tags);
2368                 if (status != REG_OK)
2369                   return status;
2370                 /* Allocate arrays for the tags and parameters. */
2371                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2372                 if (!tags)
2373                   return REG_ESPACE;
2374                 tags[0] = -1;
2375                 assertions = 0;
2376                 /* Second pass with tre_mach_empty() to get the list of
2377                    tags and parameters. */
2378                 status = tre_match_empty(stack, cat->left, tags,
2379                                          &assertions, NULL);
2380                 if (status != REG_OK)
2381                   {
2382                     xfree(tags);
2383                     return status;
2384                   }
2385                 node->firstpos =
2386                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2387                                 tags, assertions);
2388                 xfree(tags);
2389                 if (!node->firstpos)
2390                   return REG_ESPACE;
2391               }
2392             else
2393               {
2394                 node->firstpos = cat->left->firstpos;
2395               }
2396
2397             /* Compute lastpos. */
2398             if (cat->right->nullable)
2399               {
2400                 /* The right side matches the empty string.  Make a first pass
2401                    with tre_match_empty() to get the number of tags and
2402                    parameters. */
2403                 status = tre_match_empty(stack, cat->right,
2404                                          NULL, NULL, &num_tags);
2405                 if (status != REG_OK)
2406                   return status;
2407                 /* Allocate arrays for the tags and parameters. */
2408                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2409                 if (!tags)
2410                   return REG_ESPACE;
2411                 tags[0] = -1;
2412                 assertions = 0;
2413                 /* Second pass with tre_mach_empty() to get the list of
2414                    tags and parameters. */
2415                 status = tre_match_empty(stack, cat->right, tags,
2416                                          &assertions, NULL);
2417                 if (status != REG_OK)
2418                   {
2419                     xfree(tags);
2420                     return status;
2421                   }
2422                 node->lastpos =
2423                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2424                                 tags, assertions);
2425                 xfree(tags);
2426                 if (!node->lastpos)
2427                   return REG_ESPACE;
2428               }
2429             else
2430               {
2431                 node->lastpos = cat->right->lastpos;
2432               }
2433             break;
2434           }
2435
2436         default:
2437           assert(0);
2438           break;
2439         }
2440     }
2441
2442   return REG_OK;
2443 }
2444
2445
2446 /* Adds a transition from each position in `p1' to each position in `p2'. */
2447 static reg_errcode_t
2448 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2449                tre_tnfa_transition_t *transitions,
2450                int *counts, int *offs)
2451 {
2452   tre_pos_and_tags_t *orig_p2 = p2;
2453   tre_tnfa_transition_t *trans;
2454   int i, j, k, l, dup, prev_p2_pos;
2455
2456   if (transitions != NULL)
2457     while (p1->position >= 0)
2458       {
2459         p2 = orig_p2;
2460         prev_p2_pos = -1;
2461         while (p2->position >= 0)
2462           {
2463             /* Optimization: if this position was already handled, skip it. */
2464             if (p2->position == prev_p2_pos)
2465               {
2466                 p2++;
2467                 continue;
2468               }
2469             prev_p2_pos = p2->position;
2470             /* Set `trans' to point to the next unused transition from
2471                position `p1->position'. */
2472             trans = transitions + offs[p1->position];
2473             while (trans->state != NULL)
2474               {
2475 #if 0
2476                 /* If we find a previous transition from `p1->position' to
2477                    `p2->position', it is overwritten.  This can happen only
2478                    if there are nested loops in the regexp, like in "((a)*)*".
2479                    In POSIX.2 repetition using the outer loop is always
2480                    preferred over using the inner loop.  Therefore the
2481                    transition for the inner loop is useless and can be thrown
2482                    away. */
2483                 /* XXX - The same position is used for all nodes in a bracket
2484                    expression, so this optimization cannot be used (it will
2485                    break bracket expressions) unless I figure out a way to
2486                    detect it here. */
2487                 if (trans->state_id == p2->position)
2488                   {
2489                     break;
2490                   }
2491 #endif
2492                 trans++;
2493               }
2494
2495             if (trans->state == NULL)
2496               (trans + 1)->state = NULL;
2497             /* Use the character ranges, assertions, etc. from `p1' for
2498                the transition from `p1' to `p2'. */
2499             trans->code_min = p1->code_min;
2500             trans->code_max = p1->code_max;
2501             trans->state = transitions + offs[p2->position];
2502             trans->state_id = p2->position;
2503             trans->assertions = p1->assertions | p2->assertions
2504               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2505               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2506             if (p1->backref >= 0)
2507               {
2508                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2509                 assert(p2->backref < 0);
2510                 trans->u.backref = p1->backref;
2511                 trans->assertions |= ASSERT_BACKREF;
2512               }
2513             else
2514               trans->u.class = p1->class;
2515             if (p1->neg_classes != NULL)
2516               {
2517                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2518                 trans->neg_classes =
2519                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2520                 if (trans->neg_classes == NULL)
2521                   return REG_ESPACE;
2522                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2523                   trans->neg_classes[i] = p1->neg_classes[i];
2524                 trans->neg_classes[i] = (tre_ctype_t)0;
2525               }
2526             else
2527               trans->neg_classes = NULL;
2528
2529             /* Find out how many tags this transition has. */
2530             i = 0;
2531             if (p1->tags != NULL)
2532               while(p1->tags[i] >= 0)
2533                 i++;
2534             j = 0;
2535             if (p2->tags != NULL)
2536               while(p2->tags[j] >= 0)
2537                 j++;
2538
2539             /* If we are overwriting a transition, free the old tag array. */
2540             if (trans->tags != NULL)
2541               xfree(trans->tags);
2542             trans->tags = NULL;
2543
2544             /* If there were any tags, allocate an array and fill it. */
2545             if (i + j > 0)
2546               {
2547                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2548                 if (!trans->tags)
2549                   return REG_ESPACE;
2550                 i = 0;
2551                 if (p1->tags != NULL)
2552                   while(p1->tags[i] >= 0)
2553                     {
2554                       trans->tags[i] = p1->tags[i];
2555                       i++;
2556                     }
2557                 l = i;
2558                 j = 0;
2559                 if (p2->tags != NULL)
2560                   while (p2->tags[j] >= 0)
2561                     {
2562                       /* Don't add duplicates. */
2563                       dup = 0;
2564                       for (k = 0; k < i; k++)
2565                         if (trans->tags[k] == p2->tags[j])
2566                           {
2567                             dup = 1;
2568                             break;
2569                           }
2570                       if (!dup)
2571                         trans->tags[l++] = p2->tags[j];
2572                       j++;
2573                     }
2574                 trans->tags[l] = -1;
2575               }
2576
2577             p2++;
2578           }
2579         p1++;
2580       }
2581   else
2582     /* Compute a maximum limit for the number of transitions leaving
2583        from each state. */
2584     while (p1->position >= 0)
2585       {
2586         p2 = orig_p2;
2587         while (p2->position >= 0)
2588           {
2589             counts[p1->position]++;
2590             p2++;
2591           }
2592         p1++;
2593       }
2594   return REG_OK;
2595 }
2596
2597 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
2598    labelled with one character range (there are no transitions on empty
2599    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2600    the regexp. */
2601 static reg_errcode_t
2602 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2603                 int *counts, int *offs)
2604 {
2605   tre_union_t *uni;
2606   tre_catenation_t *cat;
2607   tre_iteration_t *iter;
2608   reg_errcode_t errcode = REG_OK;
2609
2610   /* XXX - recurse using a stack!. */
2611   switch (node->type)
2612     {
2613     case LITERAL:
2614       break;
2615     case UNION:
2616       uni = (tre_union_t *)node->obj;
2617       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2618       if (errcode != REG_OK)
2619         return errcode;
2620       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2621       break;
2622
2623     case CATENATION:
2624       cat = (tre_catenation_t *)node->obj;
2625       /* Add a transition from each position in cat->left->lastpos
2626          to each position in cat->right->firstpos. */
2627       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2628                                transitions, counts, offs);
2629       if (errcode != REG_OK)
2630         return errcode;
2631       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2632       if (errcode != REG_OK)
2633         return errcode;
2634       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2635       break;
2636
2637     case ITERATION:
2638       iter = (tre_iteration_t *)node->obj;
2639       assert(iter->max == -1 || iter->max == 1);
2640
2641       if (iter->max == -1)
2642         {
2643           assert(iter->min == 0 || iter->min == 1);
2644           /* Add a transition from each last position in the iterated
2645              expression to each first position. */
2646           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2647                                    transitions, counts, offs);
2648           if (errcode != REG_OK)
2649             return errcode;
2650         }
2651       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2652       break;
2653     }
2654   return errcode;
2655 }
2656
2657
2658 #define ERROR_EXIT(err)           \
2659   do                              \
2660     {                             \
2661       errcode = err;              \
2662       if (/*CONSTCOND*/1)         \
2663         goto error_exit;          \
2664     }                             \
2665  while (/*CONSTCOND*/0)
2666
2667
2668 int
2669 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2670 {
2671   tre_stack_t *stack;
2672   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2673   tre_pos_and_tags_t *p;
2674   int *counts = NULL, *offs = NULL;
2675   int i, add = 0;
2676   tre_tnfa_transition_t *transitions, *initial;
2677   tre_tnfa_t *tnfa = NULL;
2678   tre_submatch_data_t *submatch_data;
2679   tre_tag_direction_t *tag_directions = NULL;
2680   reg_errcode_t errcode;
2681   tre_mem_t mem;
2682
2683   /* Parse context. */
2684   tre_parse_ctx_t parse_ctx;
2685
2686   /* Allocate a stack used throughout the compilation process for various
2687      purposes. */
2688   stack = tre_stack_new(512, 10240, 128);
2689   if (!stack)
2690     return REG_ESPACE;
2691   /* Allocate a fast memory allocator. */
2692   mem = tre_mem_new();
2693   if (!mem)
2694     {
2695       tre_stack_destroy(stack);
2696       return REG_ESPACE;
2697     }
2698
2699   /* Parse the regexp. */
2700   memset(&parse_ctx, 0, sizeof(parse_ctx));
2701   parse_ctx.mem = mem;
2702   parse_ctx.stack = stack;
2703   parse_ctx.re = regex;
2704   parse_ctx.cflags = cflags;
2705   parse_ctx.max_backref = -1;
2706   errcode = tre_parse(&parse_ctx);
2707   if (errcode != REG_OK)
2708     ERROR_EXIT(errcode);
2709   preg->re_nsub = parse_ctx.submatch_id - 1;
2710   tree = parse_ctx.n;
2711
2712 #ifdef TRE_DEBUG
2713   tre_ast_print(tree);
2714 #endif /* TRE_DEBUG */
2715
2716   /* Referring to nonexistent subexpressions is illegal. */
2717   if (parse_ctx.max_backref > (int)preg->re_nsub)
2718     ERROR_EXIT(REG_ESUBREG);
2719
2720   /* Allocate the TNFA struct. */
2721   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2722   if (tnfa == NULL)
2723     ERROR_EXIT(REG_ESPACE);
2724   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2725   tnfa->have_approx = 0;
2726   tnfa->num_submatches = parse_ctx.submatch_id;
2727
2728   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2729      regexp does not have back references, this can be skipped. */
2730   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2731     {
2732
2733       /* Figure out how many tags we will need. */
2734       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2735       if (errcode != REG_OK)
2736         ERROR_EXIT(errcode);
2737
2738       if (tnfa->num_tags > 0)
2739         {
2740           tag_directions = xmalloc(sizeof(*tag_directions)
2741                                    * (tnfa->num_tags + 1));
2742           if (tag_directions == NULL)
2743             ERROR_EXIT(REG_ESPACE);
2744           tnfa->tag_directions = tag_directions;
2745           memset(tag_directions, -1,
2746                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
2747         }
2748       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2749                                    sizeof(*tnfa->minimal_tags));
2750       if (tnfa->minimal_tags == NULL)
2751         ERROR_EXIT(REG_ESPACE);
2752
2753       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2754                               sizeof(*submatch_data));
2755       if (submatch_data == NULL)
2756         ERROR_EXIT(REG_ESPACE);
2757       tnfa->submatch_data = submatch_data;
2758
2759       errcode = tre_add_tags(mem, stack, tree, tnfa);
2760       if (errcode != REG_OK)
2761         ERROR_EXIT(errcode);
2762
2763     }
2764
2765   /* Expand iteration nodes. */
2766   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2767                            tag_directions);
2768   if (errcode != REG_OK)
2769     ERROR_EXIT(errcode);
2770
2771   /* Add a dummy node for the final state.
2772      XXX - For certain patterns this dummy node can be optimized away,
2773            for example "a*" or "ab*".   Figure out a simple way to detect
2774            this possibility. */
2775   tmp_ast_l = tree;
2776   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2777   if (tmp_ast_r == NULL)
2778     ERROR_EXIT(REG_ESPACE);
2779
2780   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2781   if (tree == NULL)
2782     ERROR_EXIT(REG_ESPACE);
2783
2784   errcode = tre_compute_nfl(mem, stack, tree);
2785   if (errcode != REG_OK)
2786     ERROR_EXIT(errcode);
2787
2788   counts = xmalloc(sizeof(int) * parse_ctx.position);
2789   if (counts == NULL)
2790     ERROR_EXIT(REG_ESPACE);
2791
2792   offs = xmalloc(sizeof(int) * parse_ctx.position);
2793   if (offs == NULL)
2794     ERROR_EXIT(REG_ESPACE);
2795
2796   for (i = 0; i < parse_ctx.position; i++)
2797     counts[i] = 0;
2798   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2799
2800   add = 0;
2801   for (i = 0; i < parse_ctx.position; i++)
2802     {
2803       offs[i] = add;
2804       add += counts[i] + 1;
2805       counts[i] = 0;
2806     }
2807   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2808   if (transitions == NULL)
2809     ERROR_EXIT(REG_ESPACE);
2810   tnfa->transitions = transitions;
2811   tnfa->num_transitions = add;
2812
2813   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2814   if (errcode != REG_OK)
2815     ERROR_EXIT(errcode);
2816
2817   tnfa->firstpos_chars = NULL;
2818
2819   p = tree->firstpos;
2820   i = 0;
2821   while (p->position >= 0)
2822     {
2823       i++;
2824       p++;
2825     }
2826
2827   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2828   if (initial == NULL)
2829     ERROR_EXIT(REG_ESPACE);
2830   tnfa->initial = initial;
2831
2832   i = 0;
2833   for (p = tree->firstpos; p->position >= 0; p++)
2834     {
2835       initial[i].state = transitions + offs[p->position];
2836       initial[i].state_id = p->position;
2837       initial[i].tags = NULL;
2838       /* Copy the arrays p->tags, and p->params, they are allocated
2839          from a tre_mem object. */
2840       if (p->tags)
2841         {
2842           int j;
2843           for (j = 0; p->tags[j] >= 0; j++);
2844           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2845           if (!initial[i].tags)
2846             ERROR_EXIT(REG_ESPACE);
2847           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2848         }
2849       initial[i].assertions = p->assertions;
2850       i++;
2851     }
2852   initial[i].state = NULL;
2853
2854   tnfa->num_transitions = add;
2855   tnfa->final = transitions + offs[tree->lastpos[0].position];
2856   tnfa->num_states = parse_ctx.position;
2857   tnfa->cflags = cflags;
2858
2859   tre_mem_destroy(mem);
2860   tre_stack_destroy(stack);
2861   xfree(counts);
2862   xfree(offs);
2863
2864   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2865   return REG_OK;
2866
2867  error_exit:
2868   /* Free everything that was allocated and return the error code. */
2869   tre_mem_destroy(mem);
2870   if (stack != NULL)
2871     tre_stack_destroy(stack);
2872   if (counts != NULL)
2873     xfree(counts);
2874   if (offs != NULL)
2875     xfree(offs);
2876   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2877   regfree(preg);
2878   return errcode;
2879 }
2880
2881
2882
2883
2884 void
2885 regfree(regex_t *preg)
2886 {
2887   tre_tnfa_t *tnfa;
2888   unsigned int i;
2889   tre_tnfa_transition_t *trans;
2890
2891   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2892   if (!tnfa)
2893     return;
2894
2895   for (i = 0; i < tnfa->num_transitions; i++)
2896     if (tnfa->transitions[i].state)
2897       {
2898         if (tnfa->transitions[i].tags)
2899           xfree(tnfa->transitions[i].tags);
2900         if (tnfa->transitions[i].neg_classes)
2901           xfree(tnfa->transitions[i].neg_classes);
2902       }
2903   if (tnfa->transitions)
2904     xfree(tnfa->transitions);
2905
2906   if (tnfa->initial)
2907     {
2908       for (trans = tnfa->initial; trans->state; trans++)
2909         {
2910           if (trans->tags)
2911             xfree(trans->tags);
2912         }
2913       xfree(tnfa->initial);
2914     }
2915
2916   if (tnfa->submatch_data)
2917     {
2918       for (i = 0; i < tnfa->num_submatches; i++)
2919         if (tnfa->submatch_data[i].parents)
2920           xfree(tnfa->submatch_data[i].parents);
2921       xfree(tnfa->submatch_data);
2922     }
2923
2924   if (tnfa->tag_directions)
2925     xfree(tnfa->tag_directions);
2926   if (tnfa->firstpos_chars)
2927     xfree(tnfa->firstpos_chars);
2928   if (tnfa->minimal_tags)
2929     xfree(tnfa->minimal_tags);
2930   xfree(tnfa);
2931 }