nsz Git - musl/blob - src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37
  38 #include "tre.h"
  39
  40 #include <assert.h>
  41
  42 /***********************************************************************
  43  from tre-compile.h
  44 ***********************************************************************/
  45
  46 typedef struct {
  47   int position;
  48   int code_min;
  49   int code_max;
  50   int *tags;
  51   int assertions;
  52   tre_ctype_t class;
  53   tre_ctype_t *neg_classes;
  54   int backref;
  55 } tre_pos_and_tags_t;
  56
  57
  58 /***********************************************************************
  59  from tre-ast.c and tre-ast.h
  60 ***********************************************************************/
  61
  62 /* The different AST node types. */
  63 typedef enum {
  64   LITERAL,
  65   CATENATION,
  66   ITERATION,
  67   UNION
  68 } tre_ast_type_t;
  69
  70 /* Special subtypes of TRE_LITERAL. */
  71 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  72 #define ASSERTION -2   /* Assertion leaf. */
  73 #define TAG       -3   /* Tag leaf. */
  74 #define BACKREF   -4   /* Back reference leaf. */
  75
  76 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  77 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  78 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  79 #define IS_TAG(x)       ((x)->code_min == TAG)
  80 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  81
  82
  83 /* A generic AST node.  All AST nodes consist of this node on the top
  84    level with `obj' pointing to the actual content. */
  85 typedef struct {
  86   tre_ast_type_t type;   /* Type of the node. */
  87   void *obj;             /* Pointer to actual node. */
  88   int nullable;
  89   int submatch_id;
  90   int num_submatches;
  91   int num_tags;
  92   tre_pos_and_tags_t *firstpos;
  93   tre_pos_and_tags_t *lastpos;
  94 } tre_ast_node_t;
  95
  96
  97 /* A "literal" node.  These are created for assertions, back references,
  98    tags, matching parameter settings, and all expressions that match one
  99    character. */
 100 typedef struct {
 101   long code_min;
 102   long code_max;
 103   int position;
 104   tre_ctype_t class;
 105   tre_ctype_t *neg_classes;
 106 } tre_literal_t;
 107
 108 /* A "catenation" node.  These are created when two regexps are concatenated.
 109    If there are more than one subexpressions in sequence, the `left' part
 110    holds all but the last, and `right' part holds the last subexpression
 111    (catenation is left associative). */
 112 typedef struct {
 113   tre_ast_node_t *left;
 114   tre_ast_node_t *right;
 115 } tre_catenation_t;
 116
 117 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 118    operators. */
 119 typedef struct {
 120   /* Subexpression to match. */
 121   tre_ast_node_t *arg;
 122   /* Minimum number of consecutive matches. */
 123   int min;
 124   /* Maximum number of consecutive matches. */
 125   int max;
 126   /* If 0, match as many characters as possible, if 1 match as few as
 127      possible.  Note that this does not always mean the same thing as
 128      matching as many/few repetitions as possible. */
 129   unsigned int minimal:1;
 130 } tre_iteration_t;
 131
 132 /* An "union" node.  These are created for the "|" operator. */
 133 typedef struct {
 134   tre_ast_node_t *left;
 135   tre_ast_node_t *right;
 136 } tre_union_t;
 137
 138 static tre_ast_node_t *
 139 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
 140
 141 static tre_ast_node_t *
 142 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
 143
 144 static tre_ast_node_t *
 145 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
 146                  int minimal);
 147
 148 static tre_ast_node_t *
 149 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
 150
 151 static tre_ast_node_t *
 152 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
 153                        tre_ast_node_t *right);
 154
 155
 156 static tre_ast_node_t *
 157 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
 158 {
 159   tre_ast_node_t *node;
 160
 161   node = tre_mem_calloc(mem, sizeof(*node));
 162   if (!node)
 163     return NULL;
 164   node->obj = tre_mem_calloc(mem, size);
 165   if (!node->obj)
 166     return NULL;
 167   node->type = type;
 168   node->nullable = -1;
 169   node->submatch_id = -1;
 170
 171   return node;
 172 }
 173
 174 static tre_ast_node_t *
 175 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 176 {
 177   tre_ast_node_t *node;
 178   tre_literal_t *lit;
 179
 180   node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
 181   if (!node)
 182     return NULL;
 183   lit = node->obj;
 184   lit->code_min = code_min;
 185   lit->code_max = code_max;
 186   lit->position = position;
 187
 188   return node;
 189 }
 190
 191 static tre_ast_node_t *
 192 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
 193                  int minimal)
 194 {
 195   tre_ast_node_t *node;
 196   tre_iteration_t *iter;
 197
 198   node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
 199   if (!node)
 200     return NULL;
 201   iter = node->obj;
 202   iter->arg = arg;
 203   iter->min = min;
 204   iter->max = max;
 205   iter->minimal = minimal;
 206   node->num_submatches = arg->num_submatches;
 207
 208   return node;
 209 }
 210
 211 static tre_ast_node_t *
 212 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 213 {
 214   tre_ast_node_t *node;
 215
 216   node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
 217   if (node == NULL)
 218     return NULL;
 219   ((tre_union_t *)node->obj)->left = left;
 220   ((tre_union_t *)node->obj)->right = right;
 221   node->num_submatches = left->num_submatches + right->num_submatches;
 222
 223   return node;
 224 }
 225
 226 static tre_ast_node_t *
 227 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
 228                        tre_ast_node_t *right)
 229 {
 230   tre_ast_node_t *node;
 231
 232   node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
 233   if (node == NULL)
 234     return NULL;
 235   ((tre_catenation_t *)node->obj)->left = left;
 236   ((tre_catenation_t *)node->obj)->right = right;
 237   node->num_submatches = left->num_submatches + right->num_submatches;
 238
 239   return node;
 240 }
 241
 242
 243 /***********************************************************************
 244  from tre-stack.c and tre-stack.h
 245 ***********************************************************************/
 246
 247 typedef struct tre_stack_rec tre_stack_t;
 248
 249 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 250    is maximum size, and `increment' specifies how much more space will be
 251    allocated with realloc() if all space gets used up.  Returns the stack
 252    object or NULL if out of memory. */
 253 static tre_stack_t *
 254 tre_stack_new(int size, int max_size, int increment);
 255
 256 /* Frees the stack object. */
 257 static void
 258 tre_stack_destroy(tre_stack_t *s);
 259
 260 /* Returns the current number of objects in the stack. */
 261 static int
 262 tre_stack_num_objects(tre_stack_t *s);
 263
 264 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 265    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 266    This tries to realloc() more space before failing if maximum size
 267    has not yet been reached.  Returns REG_OK if successful. */
 268 #define declare_pushf(typetag, type)                                          \
 269   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 270
 271 declare_pushf(voidptr, void *);
 272 declare_pushf(int, int);
 273
 274 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 275    element off of stack `s' and returns it.  The stack must not be
 276    empty. */
 277 #define declare_popf(typetag, type)               \
 278   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 279
 280 declare_popf(voidptr, void *);
 281 declare_popf(int, int);
 282
 283 /* Just to save some typing. */
 284 #define STACK_PUSH(s, typetag, value)                                         \
 285   do                                                                          \
 286     {                                                                         \
 287       status = tre_stack_push_ ## typetag(s, value);                          \
 288     }                                                                         \
 289   while (/*CONSTCOND*/0)
 290
 291 #define STACK_PUSHX(s, typetag, value)                                        \
 292   {                                                                           \
 293     status = tre_stack_push_ ## typetag(s, value);                            \
 294     if (status != REG_OK)                                                     \
 295       break;                                                                  \
 296   }
 297
 298 #define STACK_PUSHR(s, typetag, value)                                        \
 299   {                                                                           \
 300     reg_errcode_t _status;                                                    \
 301     _status = tre_stack_push_ ## typetag(s, value);                           \
 302     if (_status != REG_OK)                                                    \
 303       return _status;                                                         \
 304   }
 305
 306 union tre_stack_item {
 307   void *voidptr_value;
 308   int int_value;
 309 };
 310
 311 struct tre_stack_rec {
 312   int size;
 313   int max_size;
 314   int increment;
 315   int ptr;
 316   union tre_stack_item *stack;
 317 };
 318
 319
 320 static tre_stack_t *
 321 tre_stack_new(int size, int max_size, int increment)
 322 {
 323   tre_stack_t *s;
 324
 325   s = xmalloc(sizeof(*s));
 326   if (s != NULL)
 327     {
 328       s->stack = xmalloc(sizeof(*s->stack) * size);
 329       if (s->stack == NULL)
 330         {
 331           xfree(s);
 332           return NULL;
 333         }
 334       s->size = size;
 335       s->max_size = max_size;
 336       s->increment = increment;
 337       s->ptr = 0;
 338     }
 339   return s;
 340 }
 341
 342 static void
 343 tre_stack_destroy(tre_stack_t *s)
 344 {
 345   xfree(s->stack);
 346   xfree(s);
 347 }
 348
 349 static int
 350 tre_stack_num_objects(tre_stack_t *s)
 351 {
 352   return s->ptr;
 353 }
 354
 355 static reg_errcode_t
 356 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 357 {
 358   if (s->ptr < s->size)
 359     {
 360       s->stack[s->ptr] = value;
 361       s->ptr++;
 362     }
 363   else
 364     {
 365       if (s->size >= s->max_size)
 366         {
 367           return REG_ESPACE;
 368         }
 369       else
 370         {
 371           union tre_stack_item *new_buffer;
 372           int new_size;
 373           new_size = s->size + s->increment;
 374           if (new_size > s->max_size)
 375             new_size = s->max_size;
 376           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 377           if (new_buffer == NULL)
 378             {
 379               return REG_ESPACE;
 380             }
 381           assert(new_size > s->size);
 382           s->size = new_size;
 383           s->stack = new_buffer;
 384           tre_stack_push(s, value);
 385         }
 386     }
 387   return REG_OK;
 388 }
 389
 390 #define define_pushf(typetag, type)  \
 391   declare_pushf(typetag, type) {     \
 392     union tre_stack_item item;       \
 393     item.typetag ## _value = value;  \
 394     return tre_stack_push(s, item);  \
 395 }
 396
 397 define_pushf(int, int)
 398 define_pushf(voidptr, void *)
 399
 400 #define define_popf(typetag, type)                  \
 401   declare_popf(typetag, type) {                     \
 402     return s->stack[--s->ptr].typetag ## _value;    \
 403   }
 404
 405 define_popf(int, int)
 406 define_popf(voidptr, void *)
 407
 408
 409 /***********************************************************************
 410  from tre-parse.c and tre-parse.h
 411 ***********************************************************************/
 412
 413 /* Parse context. */
 414 typedef struct {
 415   /* Memory allocator.  The AST is allocated using this. */
 416   tre_mem_t mem;
 417   /* Stack used for keeping track of regexp syntax. */
 418   tre_stack_t *stack;
 419   /* The parse result. */
 420   tre_ast_node_t *result;
 421   /* The regexp to parse and its length. */
 422   const char *re;
 423   /* The first character of the entire regexp. */
 424   const char *re_start;
 425   /* Current submatch ID. */
 426   int submatch_id;
 427   /* Current position (number of literal). */
 428   int position;
 429   /* The highest back reference or -1 if none seen so far. */
 430   int max_backref;
 431   /* This flag is set if the regexp uses approximate matching. */
 432   int have_approx;
 433   /* Compilation flags. */
 434   int cflags;
 435   /* If this flag is set the top-level submatch is not captured. */
 436   int nofirstsub;
 437 } tre_parse_ctx_t;
 438
 439 /* Parses a wide character regexp pattern into a syntax tree.  This parser
 440    handles both syntaxes (BRE and ERE), including the TRE extensions. */
 441 static reg_errcode_t
 442 tre_parse(tre_parse_ctx_t *ctx);
 443
 444
 445 /*
 446   This parser is just a simple recursive descent parser for POSIX.2
 447   regexps.  The parser supports both the obsolete default syntax and
 448   the "extended" syntax, and some nonstandard extensions.
 449 */
 450
 451 /* Characters with special meanings in regexp syntax. */
 452 #define CHAR_PIPE          '|'
 453 #define CHAR_LPAREN        '('
 454 #define CHAR_RPAREN        ')'
 455 #define CHAR_LBRACE        '{'
 456 #define CHAR_RBRACE        '}'
 457 #define CHAR_LBRACKET      '['
 458 #define CHAR_RBRACKET      ']'
 459 #define CHAR_MINUS         '-'
 460 #define CHAR_STAR          '*'
 461 #define CHAR_QUESTIONMARK  '?'
 462 #define CHAR_PLUS          '+'
 463 #define CHAR_PERIOD        '.'
 464 #define CHAR_COLON         ':'
 465 #define CHAR_EQUAL         '='
 466 #define CHAR_COMMA         ','
 467 #define CHAR_CARET         '^'
 468 #define CHAR_DOLLAR        '$'
 469 #define CHAR_BACKSLASH     '\\'
 470 #define CHAR_HASH          '#'
 471 #define CHAR_TILDE         '~'
 472
 473
 474 /* Some macros for expanding \w, \s, etc. */
 475 static const struct tre_macro_struct {
 476   const char c;
 477   const char *expansion;
 478 } tre_macros[] =
 479   { {'t', "\t"},           {'n', "\n"},            {'r', "\r"},
 480     {'f', "\f"},           {'a', "\a"},            {'e', "\033"},
 481     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 482     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
 483     { 0, NULL }
 484   };
 485
 486
 487 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 488    must have at least `len' items.  Sets buf[0] to zero if the there
 489    is no match in `tre_macros'. */
 490 static const char *
 491 tre_expand_macro(const char *regex)
 492 {
 493   int i;
 494
 495   if (!*regex)
 496     return 0;
 497
 498   for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
 499   return tre_macros[i].expansion;
 500 }
 501
 502 static reg_errcode_t
 503 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
 504          tre_ast_node_t ***items)
 505 {
 506   reg_errcode_t status;
 507   tre_ast_node_t **array = *items;
 508   /* Allocate more space if necessary. */
 509   if (*i >= *max_i)
 510     {
 511       tre_ast_node_t **new_items;
 512       /* If the array is already 1024 items large, give up -- there's
 513          probably an error in the regexp (e.g. not a '\0' terminated
 514          string and missing ']') */
 515       if (*max_i > 1024)
 516         return REG_ESPACE;
 517       *max_i *= 2;
 518       new_items = xrealloc(array, sizeof(*array) * *max_i);
 519       if (new_items == NULL)
 520         return REG_ESPACE;
 521       *items = array = new_items;
 522     }
 523   array[*i] = tre_ast_new_literal(mem, min, max, -1);
 524   status = array[*i] == NULL ? REG_ESPACE : REG_OK;
 525   (*i)++;
 526   return status;
 527 }
 528
 529
 530 static int
 531 tre_compare_items(const void *a, const void *b)
 532 {
 533   const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
 534   const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
 535   tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
 536   int a_min = l_a->code_min, b_min = l_b->code_min;
 537
 538   if (a_min < b_min)
 539     return -1;
 540   else if (a_min > b_min)
 541     return 1;
 542   else
 543     return 0;
 544 }
 545
 546 /* Maximum number of character classes that can occur in a negated bracket
 547    expression.  */
 548 #define MAX_NEG_CLASSES 64
 549
 550 /* Maximum length of character class names. */
 551 #define MAX_CLASS_NAME
 552
 553 static reg_errcode_t
 554 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
 555                         tre_ctype_t neg_classes[], int *num_neg_classes,
 556                         tre_ast_node_t ***items, int *num_items,
 557                         int *items_size)
 558 {
 559   const char *re = ctx->re;
 560   reg_errcode_t status = REG_OK;
 561   tre_ctype_t class = (tre_ctype_t)0;
 562   int i = *num_items;
 563   int max_i = *items_size;
 564   int skip;
 565
 566   /* Build an array of the items in the bracket expression. */
 567   while (status == REG_OK)
 568     {
 569       skip = 0;
 570       if (!*re)
 571         {
 572           status = REG_EBRACK;
 573         }
 574       else if (*re == CHAR_RBRACKET && re > ctx->re)
 575         {
 576           re++;
 577           break;
 578         }
 579       else
 580         {
 581           tre_cint_t min = 0, max = 0;
 582           wchar_t wc;
 583           int clen = mbtowc(&wc, re, -1);
 584
 585           if (clen<0) clen=1, wc=WEOF;
 586
 587           class = (tre_ctype_t)0;
 588           if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
 589             {
 590               min = wc;
 591               re += clen+1;
 592               clen = mbtowc(&wc, re, -1);
 593               if (clen<0) clen=1, wc=WEOF;
 594               max = wc;
 595               re += clen;
 596               /* XXX - Should use collation order instead of encoding values
 597                  in character ranges. */
 598               if (min > max)
 599                 status = REG_ERANGE;
 600             }
 601           else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
 602             status = REG_ECOLLATE;
 603           else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
 604             status = REG_ECOLLATE;
 605           else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
 606             {
 607               char tmp_str[64];
 608               const char *endptr = re + 2;
 609               int len;
 610               while (*endptr && *endptr != CHAR_COLON)
 611                 endptr++;
 612               if (*endptr)
 613                 {
 614                   len = MIN(endptr - re - 2, 63);
 615                   strncpy(tmp_str, re + 2, len);
 616                   tmp_str[len] = '\0';
 617                   class = tre_ctype(tmp_str);
 618                   if (!class)
 619                     status = REG_ECTYPE;
 620                   re = endptr + 2;
 621                 }
 622               else
 623                 status = REG_ECTYPE;
 624               min = 0;
 625               max = TRE_CHAR_MAX;
 626             }
 627           else
 628             {
 629               if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
 630                   && ctx->re != re)
 631                 /* Two ranges are not allowed to share and endpoint. */
 632                 status = REG_ERANGE;
 633               min = max = wc;
 634               re += clen;
 635             }
 636
 637           if (status != REG_OK)
 638             break;
 639
 640           if (class && negate)
 641             if (*num_neg_classes >= MAX_NEG_CLASSES)
 642               status = REG_ESPACE;
 643             else
 644               neg_classes[(*num_neg_classes)++] = class;
 645           else if (!skip)
 646             {
 647               status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
 648               if (status != REG_OK)
 649                 break;
 650               ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
 651             }
 652
 653           /* Add opposite-case counterpoints if REG_ICASE is present.
 654              This is broken if there are more than two "same" characters. */
 655           if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
 656             {
 657               tre_cint_t cmin, ccurr;
 658
 659               while (min <= max)
 660                 {
 661                   if (tre_islower(min))
 662                     {
 663                       cmin = ccurr = tre_toupper(min++);
 664                       while (tre_islower(min) && tre_toupper(min) == ccurr + 1
 665                              && min <= max)
 666                         ccurr = tre_toupper(min++);
 667                       status = tre_new_item(ctx->mem, cmin, ccurr,
 668                                             &i, &max_i, items);
 669                     }
 670                   else if (tre_isupper(min))
 671                     {
 672                       cmin = ccurr = tre_tolower(min++);
 673                       while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
 674                              && min <= max)
 675                         ccurr = tre_tolower(min++);
 676                       status = tre_new_item(ctx->mem, cmin, ccurr,
 677                                             &i, &max_i, items);
 678                     }
 679                   else min++;
 680                   if (status != REG_OK)
 681                     break;
 682                 }
 683               if (status != REG_OK)
 684                 break;
 685             }
 686         }
 687     }
 688   *num_items = i;
 689   *items_size = max_i;
 690   ctx->re = re;
 691   return status;
 692 }
 693
 694 static reg_errcode_t
 695 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 696 {
 697   tre_ast_node_t *node = NULL;
 698   int negate = 0;
 699   reg_errcode_t status = REG_OK;
 700   tre_ast_node_t **items, *u, *n;
 701   int i = 0, j, max_i = 32, curr_max, curr_min;
 702   tre_ctype_t neg_classes[MAX_NEG_CLASSES];
 703   int num_neg_classes = 0;
 704
 705   /* Start off with an array of `max_i' elements. */
 706   items = xmalloc(sizeof(*items) * max_i);
 707   if (items == NULL)
 708     return REG_ESPACE;
 709
 710   if (*ctx->re == CHAR_CARET)
 711     {
 712       negate = 1;
 713       ctx->re++;
 714     }
 715
 716   status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
 717                                    &items, &i, &max_i);
 718
 719   if (status != REG_OK)
 720     goto parse_bracket_done;
 721
 722   /* Sort the array if we need to negate it. */
 723   if (negate)
 724     qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
 725
 726   curr_max = curr_min = 0;
 727   /* Build a union of the items in the array, negated if necessary. */
 728   for (j = 0; j < i && status == REG_OK; j++)
 729     {
 730       int min, max;
 731       tre_literal_t *l = items[j]->obj;
 732       min = l->code_min;
 733       max = l->code_max;
 734
 735       if (negate)
 736         {
 737           if (min < curr_max)
 738             {
 739               /* Overlap. */
 740               curr_max = MAX(max + 1, curr_max);
 741               l = NULL;
 742             }
 743           else
 744             {
 745               /* No overlap. */
 746               curr_max = min - 1;
 747               if (curr_max >= curr_min)
 748                 {
 749                   l->code_min = curr_min;
 750                   l->code_max = curr_max;
 751                 }
 752               else
 753                 {
 754                   l = NULL;
 755                 }
 756               curr_min = curr_max = max + 1;
 757             }
 758         }
 759
 760       if (l != NULL)
 761         {
 762           int k;
 763           l->position = ctx->position;
 764           if (num_neg_classes > 0)
 765             {
 766               l->neg_classes = tre_mem_alloc(ctx->mem,
 767                                              (sizeof(*l->neg_classes)
 768                                               * (num_neg_classes + 1)));
 769               if (l->neg_classes == NULL)
 770                 {
 771                   status = REG_ESPACE;
 772                   break;
 773                 }
 774               for (k = 0; k < num_neg_classes; k++)
 775                 l->neg_classes[k] = neg_classes[k];
 776               l->neg_classes[k] = (tre_ctype_t)0;
 777             }
 778           else
 779             l->neg_classes = NULL;
 780           if (node == NULL)
 781             node = items[j];
 782           else
 783             {
 784               u = tre_ast_new_union(ctx->mem, node, items[j]);
 785               if (u == NULL)
 786                 status = REG_ESPACE;
 787               node = u;
 788             }
 789         }
 790     }
 791
 792   if (status != REG_OK)
 793     goto parse_bracket_done;
 794
 795   if (negate)
 796     {
 797       int k;
 798       n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
 799       if (n == NULL)
 800         status = REG_ESPACE;
 801       else
 802         {
 803           tre_literal_t *l = n->obj;
 804           if (num_neg_classes > 0)
 805             {
 806               l->neg_classes = tre_mem_alloc(ctx->mem,
 807                                              (sizeof(*l->neg_classes)
 808                                               * (num_neg_classes + 1)));
 809               if (l->neg_classes == NULL)
 810                 {
 811                   status = REG_ESPACE;
 812                   goto parse_bracket_done;
 813                 }
 814               for (k = 0; k < num_neg_classes; k++)
 815                 l->neg_classes[k] = neg_classes[k];
 816               l->neg_classes[k] = (tre_ctype_t)0;
 817             }
 818           else
 819             l->neg_classes = NULL;
 820           if (node == NULL)
 821             node = n;
 822           else
 823             {
 824               u = tre_ast_new_union(ctx->mem, node, n);
 825               if (u == NULL)
 826                 status = REG_ESPACE;
 827               node = u;
 828             }
 829         }
 830     }
 831
 832   if (status != REG_OK)
 833     goto parse_bracket_done;
 834
 835 #ifdef TRE_DEBUG
 836   tre_ast_print(node);
 837 #endif /* TRE_DEBUG */
 838
 839  parse_bracket_done:
 840   xfree(items);
 841   ctx->position++;
 842   *result = node;
 843   return status;
 844 }
 845
 846
 847 /* Parses a positive decimal integer.  Returns -1 if the string does not
 848    contain a valid number. */
 849 static int
 850 tre_parse_int(const char **regex)
 851 {
 852   int num = -1;
 853   const char *r = *regex;
 854   while (*r-'0'<10U)
 855     {
 856       if (num < 0)
 857         num = 0;
 858       num = num * 10 + *r - '0';
 859       r++;
 860     }
 861   *regex = r;
 862   return num;
 863 }
 864
 865
 866 static reg_errcode_t
 867 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 868 {
 869   int min, max;
 870   const char *r = ctx->re;
 871   int minimal = 0;
 872
 873   /* Parse number (minimum repetition count). */
 874   min = -1;
 875   if (*r >= '0' && *r <= '9') {
 876     min = tre_parse_int(&r);
 877   }
 878
 879   /* Parse comma and second number (maximum repetition count). */
 880   max = min;
 881   if (*r == CHAR_COMMA)
 882     {
 883       r++;
 884       max = tre_parse_int(&r);
 885     }
 886
 887   /* Check that the repeat counts are sane. */
 888   if ((max >= 0 && min > max) || max > RE_DUP_MAX)
 889     return REG_BADBR;
 890
 891   /* Missing }. */
 892   if (!*r)
 893     return REG_EBRACE;
 894
 895   /* Empty contents of {}. */
 896   if (r == ctx->re)
 897     return REG_BADBR;
 898
 899   /* Parse the ending '}' or '\}'.*/
 900   if (ctx->cflags & REG_EXTENDED)
 901     {
 902       if (*r != CHAR_RBRACE)
 903         return REG_BADBR;
 904       r++;
 905     }
 906   else
 907     {
 908       if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
 909         return REG_BADBR;
 910       r += 2;
 911     }
 912
 913   /* Create the AST node(s). */
 914   if (min == 0 && max == 0)
 915     {
 916       *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 917       if (*result == NULL)
 918         return REG_ESPACE;
 919     }
 920   else
 921     {
 922       if (min < 0 && max < 0)
 923         /* Only approximate parameters set, no repetitions. */
 924         min = max = 1;
 925
 926       *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
 927       if (!*result)
 928         return REG_ESPACE;
 929     }
 930
 931   ctx->re = r;
 932   return REG_OK;
 933 }
 934
 935 typedef enum {
 936   PARSE_RE = 0,
 937   PARSE_ATOM,
 938   PARSE_MARK_FOR_SUBMATCH,
 939   PARSE_BRANCH,
 940   PARSE_PIECE,
 941   PARSE_CATENATION,
 942   PARSE_POST_CATENATION,
 943   PARSE_UNION,
 944   PARSE_POST_UNION,
 945   PARSE_POSTFIX,
 946   PARSE_RESTORE_CFLAGS
 947 } tre_parse_re_stack_symbol_t;
 948
 949
 950 static reg_errcode_t
 951 tre_parse(tre_parse_ctx_t *ctx)
 952 {
 953   tre_ast_node_t *result = NULL;
 954   tre_parse_re_stack_symbol_t symbol;
 955   reg_errcode_t status = REG_OK;
 956   tre_stack_t *stack = ctx->stack;
 957   int bottom = tre_stack_num_objects(stack);
 958   int depth = 0;
 959   wchar_t wc;
 960   int clen;
 961
 962   if (!ctx->nofirstsub)
 963     {
 964       STACK_PUSH(stack, int, ctx->submatch_id);
 965       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
 966       ctx->submatch_id++;
 967     }
 968   STACK_PUSH(stack, int, PARSE_RE);
 969   ctx->re_start = ctx->re;
 970
 971
 972   /* The following is basically just a recursive descent parser.  I use
 973      an explicit stack instead of recursive functions mostly because of
 974      two reasons: compatibility with systems which have an overflowable
 975      call stack, and efficiency (both in lines of code and speed).  */
 976   while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
 977     {
 978       if (status != REG_OK)
 979         break;
 980       symbol = tre_stack_pop_int(stack);
 981       switch (symbol)
 982         {
 983         case PARSE_RE:
 984           /* Parse a full regexp.  A regexp is one or more branches,
 985              separated by the union operator `|'. */
 986           if (ctx->cflags & REG_EXTENDED)
 987             STACK_PUSHX(stack, int, PARSE_UNION);
 988           STACK_PUSHX(stack, int, PARSE_BRANCH);
 989           break;
 990
 991         case PARSE_BRANCH:
 992           /* Parse a branch.  A branch is one or more pieces, concatenated.
 993              A piece is an atom possibly followed by a postfix operator. */
 994           STACK_PUSHX(stack, int, PARSE_CATENATION);
 995           STACK_PUSHX(stack, int, PARSE_PIECE);
 996           break;
 997
 998         case PARSE_PIECE:
 999           /* Parse a piece.  A piece is an atom possibly followed by one
1000              or more postfix operators. */
1001             STACK_PUSHX(stack, int, PARSE_POSTFIX);
1002           STACK_PUSHX(stack, int, PARSE_ATOM);
1003           break;
1004
1005         case PARSE_CATENATION:
1006           /* If the expression has not ended, parse another piece. */
1007           {
1008             tre_char_t c;
1009             if (!*ctx->re)
1010               break;
1011             c = *ctx->re;
1012                 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1013                   break;
1014                 if ((ctx->cflags & REG_EXTENDED
1015                      && c == CHAR_RPAREN && depth > 0)
1016                     || (!(ctx->cflags & REG_EXTENDED)
1017                         && (c == CHAR_BACKSLASH
1018                             && *(ctx->re + 1) == CHAR_RPAREN)))
1019                   {
1020                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1021                       status = REG_EPAREN;
1022                     depth--;
1023                     if (!(ctx->cflags & REG_EXTENDED))
1024                       ctx->re += 2;
1025                     break;
1026                   }
1027
1028               {
1029                 /* Default case, left associative concatenation. */
1030                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1031                 STACK_PUSHX(stack, voidptr, result);
1032                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1033                 STACK_PUSHX(stack, int, PARSE_PIECE);
1034               }
1035             break;
1036           }
1037
1038         case PARSE_POST_CATENATION:
1039           {
1040             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1041             tre_ast_node_t *tmp_node;
1042             tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1043             if (!tmp_node)
1044               return REG_ESPACE;
1045             result = tmp_node;
1046             break;
1047           }
1048
1049         case PARSE_UNION:
1050           switch (*ctx->re)
1051             {
1052             case CHAR_PIPE:
1053               STACK_PUSHX(stack, int, PARSE_UNION);
1054               STACK_PUSHX(stack, voidptr, result);
1055               STACK_PUSHX(stack, int, PARSE_POST_UNION);
1056               STACK_PUSHX(stack, int, PARSE_BRANCH);
1057               ctx->re++;
1058               break;
1059
1060             case CHAR_RPAREN:
1061               ctx->re++;
1062               break;
1063
1064             default:
1065               break;
1066             }
1067           break;
1068
1069         case PARSE_POST_UNION:
1070           {
1071             tre_ast_node_t *tmp_node;
1072             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1073             tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1074             if (!tmp_node)
1075               return REG_ESPACE;
1076             result = tmp_node;
1077             break;
1078           }
1079
1080         case PARSE_POSTFIX:
1081           /* Parse postfix operators. */
1082           switch (*ctx->re)
1083             {
1084             case CHAR_PLUS:
1085             case CHAR_QUESTIONMARK:
1086               if (!(ctx->cflags & REG_EXTENDED))
1087                 break;
1088                 /*FALLTHROUGH*/
1089             case CHAR_STAR:
1090               {
1091                 tre_ast_node_t *tmp_node;
1092                 int minimal = 0;
1093                 int rep_min = 0;
1094                 int rep_max = -1;
1095
1096                 if (*ctx->re == CHAR_PLUS)
1097                   rep_min = 1;
1098                 if (*ctx->re == CHAR_QUESTIONMARK)
1099                   rep_max = 1;
1100
1101                 ctx->re++;
1102                 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1103                                             minimal);
1104                 if (tmp_node == NULL)
1105                   return REG_ESPACE;
1106                 result = tmp_node;
1107                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1108               }
1109               break;
1110
1111             case CHAR_BACKSLASH:
1112               /* "\{" is special without REG_EXTENDED */
1113               if (!(ctx->cflags & REG_EXTENDED)
1114                   && *(ctx->re + 1) == CHAR_LBRACE)
1115                 {
1116                   ctx->re++;
1117                   goto parse_brace;
1118                 }
1119               else
1120                 break;
1121
1122             case CHAR_LBRACE:
1123               /* "{" is literal without REG_EXTENDED */
1124               if (!(ctx->cflags & REG_EXTENDED))
1125                 break;
1126
1127             parse_brace:
1128               ctx->re++;
1129
1130               status = tre_parse_bound(ctx, &result);
1131               if (status != REG_OK)
1132                 return status;
1133               STACK_PUSHX(stack, int, PARSE_POSTFIX);
1134               break;
1135             }
1136           break;
1137
1138         case PARSE_ATOM:
1139           /* Parse an atom.  An atom is a regular expression enclosed in `()',
1140              an empty set of `()', a bracket expression, `.', `^', `$',
1141              a `\' followed by a character, or a single character. */
1142
1143           switch (*ctx->re)
1144             {
1145             case CHAR_LPAREN:  /* parenthesized subexpression */
1146
1147               if (ctx->cflags & REG_EXTENDED)
1148                 {
1149                 lparen:
1150                   depth++;
1151                     {
1152                       ctx->re++;
1153                       /* First parse a whole RE, then mark the resulting tree
1154                          for submatching. */
1155                       STACK_PUSHX(stack, int, ctx->submatch_id);
1156                       STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1157                       STACK_PUSHX(stack, int, PARSE_RE);
1158                       ctx->submatch_id++;
1159                     }
1160                 }
1161               else
1162                 goto parse_literal;
1163               break;
1164
1165             case CHAR_LBRACKET: /* bracket expression */
1166               ctx->re++;
1167               status = tre_parse_bracket(ctx, &result);
1168               if (status != REG_OK)
1169                 return status;
1170               break;
1171
1172             case CHAR_BACKSLASH:
1173               /* If this is "\(" or "\)" chew off the backslash and
1174                  try again. */
1175               if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
1176                 {
1177                   ctx->re++;
1178                   goto lparen;
1179                 }
1180               if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
1181                 {
1182                   goto empty_atom;
1183                 }
1184
1185               /* If a macro is used, parse the expanded macro recursively. */
1186               {
1187                 const char *buf = tre_expand_macro(ctx->re + 1);
1188                 if (buf)
1189                   {
1190                     tre_parse_ctx_t subctx;
1191                     memcpy(&subctx, ctx, sizeof(subctx));
1192                     subctx.re = buf;
1193                     subctx.nofirstsub = 1;
1194                     status = tre_parse(&subctx);
1195                     if (status != REG_OK)
1196                       return status;
1197                     ctx->re += 2;
1198                     ctx->position = subctx.position;
1199                     result = subctx.result;
1200                     break;
1201                   }
1202               }
1203
1204               if (!ctx->re[1])
1205                 /* Trailing backslash. */
1206                 return REG_EESCAPE;
1207
1208               ctx->re++;
1209               switch (*ctx->re)
1210                 {
1211                 case 'b':
1212                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1213                                                ASSERT_AT_WB, -1);
1214                   ctx->re++;
1215                   break;
1216                 case 'B':
1217                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1218                                                ASSERT_AT_WB_NEG, -1);
1219                   ctx->re++;
1220                   break;
1221                 case '<':
1222                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1223                                                ASSERT_AT_BOW, -1);
1224                   ctx->re++;
1225                   break;
1226                 case '>':
1227                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1228                                                ASSERT_AT_EOW, -1);
1229                   ctx->re++;
1230                   break;
1231                 case 'x':
1232                   ctx->re++;
1233                   if (ctx->re[0] != CHAR_LBRACE)
1234                     {
1235                       /* 8 bit hex char. */
1236                       char tmp[3] = {0, 0, 0};
1237                       long val;
1238
1239                       if (tre_isxdigit(ctx->re[0]))
1240                         {
1241                           tmp[0] = (char)ctx->re[0];
1242                           ctx->re++;
1243                         }
1244                       if (tre_isxdigit(ctx->re[0]))
1245                         {
1246                           tmp[1] = (char)ctx->re[0];
1247                           ctx->re++;
1248                         }
1249                       val = strtol(tmp, NULL, 16);
1250                       result = tre_ast_new_literal(ctx->mem, (int)val,
1251                                                    (int)val, ctx->position);
1252                       ctx->position++;
1253                       break;
1254                     }
1255                   else if (*ctx->re)
1256                     {
1257                       /* Wide char. */
1258                       char tmp[32];
1259                       long val;
1260                       int i = 0;
1261                       ctx->re++;
1262                       while (*ctx->re && i < sizeof tmp)
1263                         {
1264                           if (ctx->re[0] == CHAR_RBRACE)
1265                             break;
1266                           if (tre_isxdigit(ctx->re[0]))
1267                             {
1268                               tmp[i] = (char)ctx->re[0];
1269                               i++;
1270                               ctx->re++;
1271                               continue;
1272                             }
1273                           return REG_EBRACE;
1274                         }
1275                       ctx->re++;
1276                       tmp[i] = 0;
1277                       val = strtol(tmp, NULL, 16);
1278                       result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1279                                                    ctx->position);
1280                       ctx->position++;
1281                       break;
1282                     }
1283                   /*FALLTHROUGH*/
1284
1285                 default:
1286                   if (tre_isdigit(*ctx->re))
1287                     {
1288                       /* Back reference. */
1289                       int val = *ctx->re - '0';
1290                       result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1291                                                    ctx->position);
1292                       if (result == NULL)
1293                         return REG_ESPACE;
1294                       ctx->position++;
1295                       ctx->max_backref = MAX(val, ctx->max_backref);
1296                       ctx->re++;
1297                     }
1298                   else
1299                     {
1300                       /* Escaped character. */
1301                       result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1302                                                    ctx->position);
1303                       ctx->position++;
1304                       ctx->re++;
1305                     }
1306                   break;
1307                 }
1308               if (result == NULL)
1309                 return REG_ESPACE;
1310               break;
1311
1312             case CHAR_PERIOD:    /* the any-symbol */
1313               if (ctx->cflags & REG_NEWLINE)
1314                 {
1315                   tre_ast_node_t *tmp1;
1316                   tre_ast_node_t *tmp2;
1317                   tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1318                                              ctx->position);
1319                   if (!tmp1)
1320                     return REG_ESPACE;
1321                   tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1322                                              ctx->position + 1);
1323                   if (!tmp2)
1324                     return REG_ESPACE;
1325                   result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1326                   if (!result)
1327                     return REG_ESPACE;
1328                   ctx->position += 2;
1329                 }
1330               else
1331                 {
1332                   result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1333                                                ctx->position);
1334                   if (!result)
1335                     return REG_ESPACE;
1336                   ctx->position++;
1337                 }
1338               ctx->re++;
1339               break;
1340
1341             case CHAR_CARET:     /* beginning of line assertion */
1342               /* '^' has a special meaning everywhere in EREs, and at
1343                  beginning of BRE. */
1344               if (ctx->cflags & REG_EXTENDED
1345                   || ctx->re == ctx->re_start)
1346                 {
1347                   if (!(ctx->cflags & REG_EXTENDED))
1348                     STACK_PUSHX(stack, int, PARSE_CATENATION);
1349                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1350                                                ASSERT_AT_BOL, -1);
1351                   if (result == NULL)
1352                     return REG_ESPACE;
1353                   ctx->re++;
1354                 }
1355               else
1356                 goto parse_literal;
1357               break;
1358
1359             case CHAR_DOLLAR:    /* end of line assertion. */
1360               /* '$' is special everywhere in EREs, and in the end of the
1361                  string in BREs. */
1362               if (ctx->cflags & REG_EXTENDED
1363                   || !*(ctx->re + 1))
1364                 {
1365                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1366                                                ASSERT_AT_EOL, -1);
1367                   if (result == NULL)
1368                     return REG_ESPACE;
1369                   ctx->re++;
1370                 }
1371               else
1372                 goto parse_literal;
1373               break;
1374
1375             case CHAR_RPAREN:
1376               if (!depth)
1377                 goto parse_literal;
1378             case CHAR_STAR:
1379             case CHAR_PIPE:
1380             case CHAR_LBRACE:
1381             case CHAR_PLUS:
1382             case CHAR_QUESTIONMARK:
1383               if (!(ctx->cflags & REG_EXTENDED))
1384                 goto parse_literal;
1385
1386             case 0:
1387             empty_atom:
1388               result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1389               if (!result)
1390                 return REG_ESPACE;
1391               break;
1392
1393             default:
1394             parse_literal:
1395
1396               clen = mbtowc(&wc, ctx->re, -1);
1397               if (clen<0) clen=1, wc=WEOF;
1398
1399               /* Note that we can't use an tre_isalpha() test here, since there
1400                  may be characters which are alphabetic but neither upper or
1401                  lower case. */
1402               if (ctx->cflags & REG_ICASE
1403                   && (tre_isupper(wc) || tre_islower(wc)))
1404                 {
1405                   tre_ast_node_t *tmp1;
1406                   tre_ast_node_t *tmp2;
1407
1408                   /* XXX - Can there be more than one opposite-case
1409                      counterpoints for some character in some locale?  Or
1410                      more than two characters which all should be regarded
1411                      the same character if case is ignored?  If yes, there
1412                      does not seem to be a portable way to detect it.  I guess
1413                      that at least for multi-character collating elements there
1414                      could be several opposite-case counterpoints, but they
1415                      cannot be supported portably anyway. */
1416                   tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1417                                              tre_toupper(wc),
1418                                              ctx->position);
1419                   if (!tmp1)
1420                     return REG_ESPACE;
1421                   tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1422                                              tre_tolower(wc),
1423                                              ctx->position);
1424                   if (!tmp2)
1425                     return REG_ESPACE;
1426                   result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1427                   if (!result)
1428                     return REG_ESPACE;
1429                 }
1430               else
1431                 {
1432                   result = tre_ast_new_literal(ctx->mem, wc, wc,
1433                                                ctx->position);
1434                   if (!result)
1435                     return REG_ESPACE;
1436                 }
1437               ctx->position++;
1438               ctx->re += clen;
1439               break;
1440             }
1441           break;
1442
1443         case PARSE_MARK_FOR_SUBMATCH:
1444           {
1445             int submatch_id = tre_stack_pop_int(stack);
1446
1447             if (result->submatch_id >= 0)
1448               {
1449                 tre_ast_node_t *n, *tmp_node;
1450                 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1451                 if (n == NULL)
1452                   return REG_ESPACE;
1453                 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1454                 if (tmp_node == NULL)
1455                   return REG_ESPACE;
1456                 tmp_node->num_submatches = result->num_submatches;
1457                 result = tmp_node;
1458               }
1459             result->submatch_id = submatch_id;
1460             result->num_submatches++;
1461             break;
1462           }
1463
1464         case PARSE_RESTORE_CFLAGS:
1465           ctx->cflags = tre_stack_pop_int(stack);
1466           break;
1467
1468         default:
1469           assert(0);
1470           break;
1471         }
1472     }
1473
1474   /* Check for missing closing parentheses. */
1475   if (depth > 0)
1476     return REG_EPAREN;
1477
1478   if (status == REG_OK)
1479     ctx->result = result;
1480
1481   return status;
1482 }
1483
1484
1485
1486 /***********************************************************************
1487  from tre-compile.c
1488 ***********************************************************************/
1489
1490
1491 /*
1492   TODO:
1493    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1494      function calls.
1495 */
1496
1497 /*
1498   Algorithms to setup tags so that submatch addressing can be done.
1499 */
1500
1501
1502 /* Inserts a catenation node to the root of the tree given in `node'.
1503    As the left child a new tag with number `tag_id' to `node' is added,
1504    and the right child is the old root. */
1505 static reg_errcode_t
1506 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1507 {
1508   tre_catenation_t *c;
1509
1510   c = tre_mem_alloc(mem, sizeof(*c));
1511   if (c == NULL)
1512     return REG_ESPACE;
1513   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1514   if (c->left == NULL)
1515     return REG_ESPACE;
1516   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1517   if (c->right == NULL)
1518     return REG_ESPACE;
1519
1520   c->right->obj = node->obj;
1521   c->right->type = node->type;
1522   c->right->nullable = -1;
1523   c->right->submatch_id = -1;
1524   c->right->firstpos = NULL;
1525   c->right->lastpos = NULL;
1526   c->right->num_tags = 0;
1527   node->obj = c;
1528   node->type = CATENATION;
1529   return REG_OK;
1530 }
1531
1532 /* Inserts a catenation node to the root of the tree given in `node'.
1533    As the right child a new tag with number `tag_id' to `node' is added,
1534    and the left child is the old root. */
1535 static reg_errcode_t
1536 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1537 {
1538   tre_catenation_t *c;
1539
1540   c = tre_mem_alloc(mem, sizeof(*c));
1541   if (c == NULL)
1542     return REG_ESPACE;
1543   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1544   if (c->right == NULL)
1545     return REG_ESPACE;
1546   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1547   if (c->left == NULL)
1548     return REG_ESPACE;
1549
1550   c->left->obj = node->obj;
1551   c->left->type = node->type;
1552   c->left->nullable = -1;
1553   c->left->submatch_id = -1;
1554   c->left->firstpos = NULL;
1555   c->left->lastpos = NULL;
1556   c->left->num_tags = 0;
1557   node->obj = c;
1558   node->type = CATENATION;
1559   return REG_OK;
1560 }
1561
1562 typedef enum {
1563   ADDTAGS_RECURSE,
1564   ADDTAGS_AFTER_ITERATION,
1565   ADDTAGS_AFTER_UNION_LEFT,
1566   ADDTAGS_AFTER_UNION_RIGHT,
1567   ADDTAGS_AFTER_CAT_LEFT,
1568   ADDTAGS_AFTER_CAT_RIGHT,
1569   ADDTAGS_SET_SUBMATCH_END
1570 } tre_addtags_symbol_t;
1571
1572
1573 typedef struct {
1574   int tag;
1575   int next_tag;
1576 } tre_tag_states_t;
1577
1578
1579 /* Go through `regset' and set submatch data for submatches that are
1580    using this tag. */
1581 static void
1582 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1583 {
1584   int i;
1585
1586   for (i = 0; regset[i] >= 0; i++)
1587     {
1588       int id = regset[i] / 2;
1589       int start = !(regset[i] % 2);
1590       if (start)
1591         tnfa->submatch_data[id].so_tag = tag;
1592       else
1593         tnfa->submatch_data[id].eo_tag = tag;
1594     }
1595   regset[0] = -1;
1596 }
1597
1598
1599 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1600    subexpressions marked for submatch addressing can be traced. */
1601 static reg_errcode_t
1602 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1603              tre_tnfa_t *tnfa)
1604 {
1605   reg_errcode_t status = REG_OK;
1606   tre_addtags_symbol_t symbol;
1607   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1608   int bottom = tre_stack_num_objects(stack);
1609   /* True for first pass (counting number of needed tags) */
1610   int first_pass = (mem == NULL || tnfa == NULL);
1611   int *regset, *orig_regset;
1612   int num_tags = 0; /* Total number of tags. */
1613   int num_minimals = 0;  /* Number of special minimal tags. */
1614   int tag = 0;      /* The tag that is to be added next. */
1615   int next_tag = 1; /* Next tag to use after this one. */
1616   int *parents;     /* Stack of submatches the current submatch is
1617                        contained in. */
1618   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1619   tre_tag_states_t *saved_states;
1620
1621   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1622   if (!first_pass)
1623     {
1624       tnfa->end_tag = 0;
1625       tnfa->minimal_tags[0] = -1;
1626     }
1627
1628   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1629   if (regset == NULL)
1630     return REG_ESPACE;
1631   regset[0] = -1;
1632   orig_regset = regset;
1633
1634   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1635   if (parents == NULL)
1636     {
1637       xfree(regset);
1638       return REG_ESPACE;
1639     }
1640   parents[0] = -1;
1641
1642   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1643   if (saved_states == NULL)
1644     {
1645       xfree(regset);
1646       xfree(parents);
1647       return REG_ESPACE;
1648     }
1649   else
1650     {
1651       unsigned int i;
1652       for (i = 0; i <= tnfa->num_submatches; i++)
1653         saved_states[i].tag = -1;
1654     }
1655
1656   STACK_PUSH(stack, voidptr, node);
1657   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1658
1659   while (tre_stack_num_objects(stack) > bottom)
1660     {
1661       if (status != REG_OK)
1662         break;
1663
1664       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1665       switch (symbol)
1666         {
1667
1668         case ADDTAGS_SET_SUBMATCH_END:
1669           {
1670             int id = tre_stack_pop_int(stack);
1671             int i;
1672
1673             /* Add end of this submatch to regset. */
1674             for (i = 0; regset[i] >= 0; i++);
1675             regset[i] = id * 2 + 1;
1676             regset[i + 1] = -1;
1677
1678             /* Pop this submatch from the parents stack. */
1679             for (i = 0; parents[i] >= 0; i++);
1680             parents[i - 1] = -1;
1681             break;
1682           }
1683
1684         case ADDTAGS_RECURSE:
1685           node = tre_stack_pop_voidptr(stack);
1686
1687           if (node->submatch_id >= 0)
1688             {
1689               int id = node->submatch_id;
1690               int i;
1691
1692
1693               /* Add start of this submatch to regset. */
1694               for (i = 0; regset[i] >= 0; i++);
1695               regset[i] = id * 2;
1696               regset[i + 1] = -1;
1697
1698               if (!first_pass)
1699                 {
1700                   for (i = 0; parents[i] >= 0; i++);
1701                   tnfa->submatch_data[id].parents = NULL;
1702                   if (i > 0)
1703                     {
1704                       int *p = xmalloc(sizeof(*p) * (i + 1));
1705                       if (p == NULL)
1706                         {
1707                           status = REG_ESPACE;
1708                           break;
1709                         }
1710                       assert(tnfa->submatch_data[id].parents == NULL);
1711                       tnfa->submatch_data[id].parents = p;
1712                       for (i = 0; parents[i] >= 0; i++)
1713                         p[i] = parents[i];
1714                       p[i] = -1;
1715                     }
1716                 }
1717
1718               /* Add end of this submatch to regset after processing this
1719                  node. */
1720               STACK_PUSHX(stack, int, node->submatch_id);
1721               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1722             }
1723
1724           switch (node->type)
1725             {
1726             case LITERAL:
1727               {
1728                 tre_literal_t *lit = node->obj;
1729
1730                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1731                   {
1732                     int i;
1733                     if (regset[0] >= 0)
1734                       {
1735                         /* Regset is not empty, so add a tag before the
1736                            literal or backref. */
1737                         if (!first_pass)
1738                           {
1739                             status = tre_add_tag_left(mem, node, tag);
1740                             tnfa->tag_directions[tag] = direction;
1741                             if (minimal_tag >= 0)
1742                               {
1743                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1744                                 tnfa->minimal_tags[i] = tag;
1745                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1746                                 tnfa->minimal_tags[i + 2] = -1;
1747                                 minimal_tag = -1;
1748                                 num_minimals++;
1749                               }
1750                             tre_purge_regset(regset, tnfa, tag);
1751                           }
1752                         else
1753                           {
1754                             node->num_tags = 1;
1755                           }
1756
1757                         regset[0] = -1;
1758                         tag = next_tag;
1759                         num_tags++;
1760                         next_tag++;
1761                       }
1762                   }
1763                 else
1764                   {
1765                     assert(!IS_TAG(lit));
1766                   }
1767                 break;
1768               }
1769             case CATENATION:
1770               {
1771                 tre_catenation_t *cat = node->obj;
1772                 tre_ast_node_t *left = cat->left;
1773                 tre_ast_node_t *right = cat->right;
1774                 int reserved_tag = -1;
1775
1776
1777                 /* After processing right child. */
1778                 STACK_PUSHX(stack, voidptr, node);
1779                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1780
1781                 /* Process right child. */
1782                 STACK_PUSHX(stack, voidptr, right);
1783                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1784
1785                 /* After processing left child. */
1786                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1787                 if (left->num_tags > 0 && right->num_tags > 0)
1788                   {
1789                     /* Reserve the next tag to the right child. */
1790                     reserved_tag = next_tag;
1791                     next_tag++;
1792                   }
1793                 STACK_PUSHX(stack, int, reserved_tag);
1794                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1795
1796                 /* Process left child. */
1797                 STACK_PUSHX(stack, voidptr, left);
1798                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1799
1800                 }
1801               break;
1802             case ITERATION:
1803               {
1804                 tre_iteration_t *iter = node->obj;
1805
1806                 if (first_pass)
1807                   {
1808                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1809                   }
1810                 else
1811                   {
1812                     STACK_PUSHX(stack, int, tag);
1813                     STACK_PUSHX(stack, int, iter->minimal);
1814                   }
1815                 STACK_PUSHX(stack, voidptr, node);
1816                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1817
1818                 STACK_PUSHX(stack, voidptr, iter->arg);
1819                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1820
1821                 /* Regset is not empty, so add a tag here. */
1822                 if (regset[0] >= 0 || iter->minimal)
1823                   {
1824                     if (!first_pass)
1825                       {
1826                         int i;
1827                         status = tre_add_tag_left(mem, node, tag);
1828                         if (iter->minimal)
1829                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1830                         else
1831                           tnfa->tag_directions[tag] = direction;
1832                         if (minimal_tag >= 0)
1833                           {
1834                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1835                             tnfa->minimal_tags[i] = tag;
1836                             tnfa->minimal_tags[i + 1] = minimal_tag;
1837                             tnfa->minimal_tags[i + 2] = -1;
1838                             minimal_tag = -1;
1839                             num_minimals++;
1840                           }
1841                         tre_purge_regset(regset, tnfa, tag);
1842                       }
1843
1844                     regset[0] = -1;
1845                     tag = next_tag;
1846                     num_tags++;
1847                     next_tag++;
1848                   }
1849                 direction = TRE_TAG_MINIMIZE;
1850               }
1851               break;
1852             case UNION:
1853               {
1854                 tre_union_t *uni = node->obj;
1855                 tre_ast_node_t *left = uni->left;
1856                 tre_ast_node_t *right = uni->right;
1857                 int left_tag;
1858                 int right_tag;
1859
1860                 if (regset[0] >= 0)
1861                   {
1862                     left_tag = next_tag;
1863                     right_tag = next_tag + 1;
1864                   }
1865                 else
1866                   {
1867                     left_tag = tag;
1868                     right_tag = next_tag;
1869                   }
1870
1871                 /* After processing right child. */
1872                 STACK_PUSHX(stack, int, right_tag);
1873                 STACK_PUSHX(stack, int, left_tag);
1874                 STACK_PUSHX(stack, voidptr, regset);
1875                 STACK_PUSHX(stack, int, regset[0] >= 0);
1876                 STACK_PUSHX(stack, voidptr, node);
1877                 STACK_PUSHX(stack, voidptr, right);
1878                 STACK_PUSHX(stack, voidptr, left);
1879                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1880
1881                 /* Process right child. */
1882                 STACK_PUSHX(stack, voidptr, right);
1883                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1884
1885                 /* After processing left child. */
1886                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1887
1888                 /* Process left child. */
1889                 STACK_PUSHX(stack, voidptr, left);
1890                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1891
1892                 /* Regset is not empty, so add a tag here. */
1893                 if (regset[0] >= 0)
1894                   {
1895                     if (!first_pass)
1896                       {
1897                         int i;
1898                         status = tre_add_tag_left(mem, node, tag);
1899                         tnfa->tag_directions[tag] = direction;
1900                         if (minimal_tag >= 0)
1901                           {
1902                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1903                             tnfa->minimal_tags[i] = tag;
1904                             tnfa->minimal_tags[i + 1] = minimal_tag;
1905                             tnfa->minimal_tags[i + 2] = -1;
1906                             minimal_tag = -1;
1907                             num_minimals++;
1908                           }
1909                         tre_purge_regset(regset, tnfa, tag);
1910                       }
1911
1912                     regset[0] = -1;
1913                     tag = next_tag;
1914                     num_tags++;
1915                     next_tag++;
1916                   }
1917
1918                 if (node->num_submatches > 0)
1919                   {
1920                     /* The next two tags are reserved for markers. */
1921                     next_tag++;
1922                     tag = next_tag;
1923                     next_tag++;
1924                   }
1925
1926                 break;
1927               }
1928             }
1929
1930           if (node->submatch_id >= 0)
1931             {
1932               int i;
1933               /* Push this submatch on the parents stack. */
1934               for (i = 0; parents[i] >= 0; i++);
1935               parents[i] = node->submatch_id;
1936               parents[i + 1] = -1;
1937             }
1938
1939           break; /* end case: ADDTAGS_RECURSE */
1940
1941         case ADDTAGS_AFTER_ITERATION:
1942           {
1943             int minimal = 0;
1944             int enter_tag;
1945             node = tre_stack_pop_voidptr(stack);
1946             if (first_pass)
1947               {
1948                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1949                   + tre_stack_pop_int(stack);
1950                 minimal_tag = -1;
1951               }
1952             else
1953               {
1954                 minimal = tre_stack_pop_int(stack);
1955                 enter_tag = tre_stack_pop_int(stack);
1956                 if (minimal)
1957                   minimal_tag = enter_tag;
1958               }
1959
1960             if (!first_pass)
1961               {
1962                 if (minimal)
1963                   direction = TRE_TAG_MINIMIZE;
1964                 else
1965                   direction = TRE_TAG_MAXIMIZE;
1966               }
1967             break;
1968           }
1969
1970         case ADDTAGS_AFTER_CAT_LEFT:
1971           {
1972             int new_tag = tre_stack_pop_int(stack);
1973             next_tag = tre_stack_pop_int(stack);
1974             if (new_tag >= 0)
1975               {
1976                 tag = new_tag;
1977               }
1978             break;
1979           }
1980
1981         case ADDTAGS_AFTER_CAT_RIGHT:
1982           node = tre_stack_pop_voidptr(stack);
1983           if (first_pass)
1984             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1985               + ((tre_catenation_t *)node->obj)->right->num_tags;
1986           break;
1987
1988         case ADDTAGS_AFTER_UNION_LEFT:
1989           /* Lift the bottom of the `regset' array so that when processing
1990              the right operand the items currently in the array are
1991              invisible.  The original bottom was saved at ADDTAGS_UNION and
1992              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1993           while (*regset >= 0)
1994             regset++;
1995           break;
1996
1997         case ADDTAGS_AFTER_UNION_RIGHT:
1998           {
1999             int added_tags, tag_left, tag_right;
2000             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2001             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2002             node = tre_stack_pop_voidptr(stack);
2003             added_tags = tre_stack_pop_int(stack);
2004             if (first_pass)
2005               {
2006                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2007                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2008                   + ((node->num_submatches > 0) ? 2 : 0);
2009               }
2010             regset = tre_stack_pop_voidptr(stack);
2011             tag_left = tre_stack_pop_int(stack);
2012             tag_right = tre_stack_pop_int(stack);
2013
2014             /* Add tags after both children, the left child gets a smaller
2015                tag than the right child.  This guarantees that we prefer
2016                the left child over the right child. */
2017             /* XXX - This is not always necessary (if the children have
2018                tags which must be seen for every match of that child). */
2019             /* XXX - Check if this is the only place where tre_add_tag_right
2020                is used.  If so, use tre_add_tag_left (putting the tag before
2021                the child as opposed after the child) and throw away
2022                tre_add_tag_right. */
2023             if (node->num_submatches > 0)
2024               {
2025                 if (!first_pass)
2026                   {
2027                     status = tre_add_tag_right(mem, left, tag_left);
2028                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2029                     status = tre_add_tag_right(mem, right, tag_right);
2030                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2031                   }
2032                 num_tags += 2;
2033               }
2034             direction = TRE_TAG_MAXIMIZE;
2035             break;
2036           }
2037
2038         default:
2039           assert(0);
2040           break;
2041
2042         } /* end switch(symbol) */
2043     } /* end while(tre_stack_num_objects(stack) > bottom) */
2044
2045   if (!first_pass)
2046     tre_purge_regset(regset, tnfa, tag);
2047
2048   if (!first_pass && minimal_tag >= 0)
2049     {
2050       int i;
2051       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2052       tnfa->minimal_tags[i] = tag;
2053       tnfa->minimal_tags[i + 1] = minimal_tag;
2054       tnfa->minimal_tags[i + 2] = -1;
2055       minimal_tag = -1;
2056       num_minimals++;
2057     }
2058
2059   assert(tree->num_tags == num_tags);
2060   tnfa->end_tag = num_tags;
2061   tnfa->num_tags = num_tags;
2062   tnfa->num_minimals = num_minimals;
2063   xfree(orig_regset);
2064   xfree(parents);
2065   xfree(saved_states);
2066   return status;
2067 }
2068
2069
2070
2071 /*
2072   AST to TNFA compilation routines.
2073 */
2074
2075 typedef enum {
2076   COPY_RECURSE,
2077   COPY_SET_RESULT_PTR
2078 } tre_copyast_symbol_t;
2079
2080 /* Flags for tre_copy_ast(). */
2081 #define COPY_REMOVE_TAGS         1
2082 #define COPY_MAXIMIZE_FIRST_TAG  2
2083
2084 static reg_errcode_t
2085 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2086              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2087              tre_ast_node_t **copy, int *max_pos)
2088 {
2089   reg_errcode_t status = REG_OK;
2090   int bottom = tre_stack_num_objects(stack);
2091   int num_copied = 0;
2092   int first_tag = 1;
2093   tre_ast_node_t **result = copy;
2094   tre_copyast_symbol_t symbol;
2095
2096   STACK_PUSH(stack, voidptr, ast);
2097   STACK_PUSH(stack, int, COPY_RECURSE);
2098
2099   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2100     {
2101       tre_ast_node_t *node;
2102       if (status != REG_OK)
2103         break;
2104
2105       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2106       switch (symbol)
2107         {
2108         case COPY_SET_RESULT_PTR:
2109           result = tre_stack_pop_voidptr(stack);
2110           break;
2111         case COPY_RECURSE:
2112           node = tre_stack_pop_voidptr(stack);
2113           switch (node->type)
2114             {
2115             case LITERAL:
2116               {
2117                 tre_literal_t *lit = node->obj;
2118                 int pos = lit->position;
2119                 int min = lit->code_min;
2120                 int max = lit->code_max;
2121                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2122                   {
2123                     /* XXX - e.g. [ab] has only one position but two
2124                        nodes, so we are creating holes in the state space
2125                        here.  Not fatal, just wastes memory. */
2126                     pos += *pos_add;
2127                     num_copied++;
2128                   }
2129                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2130                   {
2131                     /* Change this tag to empty. */
2132                     min = EMPTY;
2133                     max = pos = -1;
2134                   }
2135                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2136                          && first_tag)
2137                   {
2138                     /* Maximize the first tag. */
2139                     tag_directions[max] = TRE_TAG_MAXIMIZE;
2140                     first_tag = 0;
2141                   }
2142                 *result = tre_ast_new_literal(mem, min, max, pos);
2143                 if (*result == NULL)
2144                   status = REG_ESPACE;
2145
2146                 if (pos > *max_pos)
2147                   *max_pos = pos;
2148                 break;
2149               }
2150             case UNION:
2151               {
2152                 tre_union_t *uni = node->obj;
2153                 tre_union_t *tmp;
2154                 *result = tre_ast_new_union(mem, uni->left, uni->right);
2155                 if (*result == NULL)
2156                   {
2157                     status = REG_ESPACE;
2158                     break;
2159                   }
2160                 tmp = (*result)->obj;
2161                 result = &tmp->left;
2162                 STACK_PUSHX(stack, voidptr, uni->right);
2163                 STACK_PUSHX(stack, int, COPY_RECURSE);
2164                 STACK_PUSHX(stack, voidptr, &tmp->right);
2165                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2166                 STACK_PUSHX(stack, voidptr, uni->left);
2167                 STACK_PUSHX(stack, int, COPY_RECURSE);
2168                 break;
2169               }
2170             case CATENATION:
2171               {
2172                 tre_catenation_t *cat = node->obj;
2173                 tre_catenation_t *tmp;
2174                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2175                 if (*result == NULL)
2176                   {
2177                     status = REG_ESPACE;
2178                     break;
2179                   }
2180                 tmp = (*result)->obj;
2181                 tmp->left = NULL;
2182                 tmp->right = NULL;
2183                 result = &tmp->left;
2184
2185                 STACK_PUSHX(stack, voidptr, cat->right);
2186                 STACK_PUSHX(stack, int, COPY_RECURSE);
2187                 STACK_PUSHX(stack, voidptr, &tmp->right);
2188                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2189                 STACK_PUSHX(stack, voidptr, cat->left);
2190                 STACK_PUSHX(stack, int, COPY_RECURSE);
2191                 break;
2192               }
2193             case ITERATION:
2194               {
2195                 tre_iteration_t *iter = node->obj;
2196                 STACK_PUSHX(stack, voidptr, iter->arg);
2197                 STACK_PUSHX(stack, int, COPY_RECURSE);
2198                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2199                                            iter->max, iter->minimal);
2200                 if (*result == NULL)
2201                   {
2202                     status = REG_ESPACE;
2203                     break;
2204                   }
2205                 iter = (*result)->obj;
2206                 result = &iter->arg;
2207                 break;
2208               }
2209             default:
2210               assert(0);
2211               break;
2212             }
2213           break;
2214         }
2215     }
2216   *pos_add += num_copied;
2217   return status;
2218 }
2219
2220 typedef enum {
2221   EXPAND_RECURSE,
2222   EXPAND_AFTER_ITER
2223 } tre_expand_ast_symbol_t;
2224
2225 /* Expands each iteration node that has a finite nonzero minimum or maximum
2226    iteration count to a catenated sequence of copies of the node. */
2227 static reg_errcode_t
2228 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2229                int *position, tre_tag_direction_t *tag_directions)
2230 {
2231   reg_errcode_t status = REG_OK;
2232   int bottom = tre_stack_num_objects(stack);
2233   int pos_add = 0;
2234   int pos_add_total = 0;
2235   int max_pos = 0;
2236   int iter_depth = 0;
2237
2238   STACK_PUSHR(stack, voidptr, ast);
2239   STACK_PUSHR(stack, int, EXPAND_RECURSE);
2240   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2241     {
2242       tre_ast_node_t *node;
2243       tre_expand_ast_symbol_t symbol;
2244
2245       if (status != REG_OK)
2246         break;
2247
2248       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2249       node = tre_stack_pop_voidptr(stack);
2250       switch (symbol)
2251         {
2252         case EXPAND_RECURSE:
2253           switch (node->type)
2254             {
2255             case LITERAL:
2256               {
2257                 tre_literal_t *lit= node->obj;
2258                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2259                   {
2260                     lit->position += pos_add;
2261                     if (lit->position > max_pos)
2262                       max_pos = lit->position;
2263                   }
2264                 break;
2265               }
2266             case UNION:
2267               {
2268                 tre_union_t *uni = node->obj;
2269                 STACK_PUSHX(stack, voidptr, uni->right);
2270                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2271                 STACK_PUSHX(stack, voidptr, uni->left);
2272                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2273                 break;
2274               }
2275             case CATENATION:
2276               {
2277                 tre_catenation_t *cat = node->obj;
2278                 STACK_PUSHX(stack, voidptr, cat->right);
2279                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2280                 STACK_PUSHX(stack, voidptr, cat->left);
2281                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2282                 break;
2283               }
2284             case ITERATION:
2285               {
2286                 tre_iteration_t *iter = node->obj;
2287                 STACK_PUSHX(stack, int, pos_add);
2288                 STACK_PUSHX(stack, voidptr, node);
2289                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2290                 STACK_PUSHX(stack, voidptr, iter->arg);
2291                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2292                 /* If we are going to expand this node at EXPAND_AFTER_ITER
2293                    then don't increase the `pos' fields of the nodes now, it
2294                    will get done when expanding. */
2295                 if (iter->min > 1 || iter->max > 1)
2296                   pos_add = 0;
2297                 iter_depth++;
2298                 break;
2299               }
2300             default:
2301               assert(0);
2302               break;
2303             }
2304           break;
2305         case EXPAND_AFTER_ITER:
2306           {
2307             tre_iteration_t *iter = node->obj;
2308             int pos_add_last;
2309             pos_add = tre_stack_pop_int(stack);
2310             pos_add_last = pos_add;
2311             if (iter->min > 1 || iter->max > 1)
2312               {
2313                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2314                 int j;
2315                 int pos_add_save = pos_add;
2316
2317                 /* Create a catenated sequence of copies of the node. */
2318                 for (j = 0; j < iter->min; j++)
2319                   {
2320                     tre_ast_node_t *copy;
2321                     /* Remove tags from all but the last copy. */
2322                     int flags = ((j + 1 < iter->min)
2323                                  ? COPY_REMOVE_TAGS
2324                                  : COPY_MAXIMIZE_FIRST_TAG);
2325                     pos_add_save = pos_add;
2326                     status = tre_copy_ast(mem, stack, iter->arg, flags,
2327                                           &pos_add, tag_directions, &copy,
2328                                           &max_pos);
2329                     if (status != REG_OK)
2330                       return status;
2331                     if (seq1 != NULL)
2332                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
2333                     else
2334                       seq1 = copy;
2335                     if (seq1 == NULL)
2336                       return REG_ESPACE;
2337                   }
2338
2339                 if (iter->max == -1)
2340                   {
2341                     /* No upper limit. */
2342                     pos_add_save = pos_add;
2343                     status = tre_copy_ast(mem, stack, iter->arg, 0,
2344                                           &pos_add, NULL, &seq2, &max_pos);
2345                     if (status != REG_OK)
2346                       return status;
2347                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2348                     if (seq2 == NULL)
2349                       return REG_ESPACE;
2350                   }
2351                 else
2352                   {
2353                     for (j = iter->min; j < iter->max; j++)
2354                       {
2355                         tre_ast_node_t *tmp, *copy;
2356                         pos_add_save = pos_add;
2357                         status = tre_copy_ast(mem, stack, iter->arg, 0,
2358                                               &pos_add, NULL, &copy, &max_pos);
2359                         if (status != REG_OK)
2360                           return status;
2361                         if (seq2 != NULL)
2362                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
2363                         else
2364                           seq2 = copy;
2365                         if (seq2 == NULL)
2366                           return REG_ESPACE;
2367                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2368                         if (tmp == NULL)
2369                           return REG_ESPACE;
2370                         seq2 = tre_ast_new_union(mem, tmp, seq2);
2371                         if (seq2 == NULL)
2372                           return REG_ESPACE;
2373                       }
2374                   }
2375
2376                 pos_add = pos_add_save;
2377                 if (seq1 == NULL)
2378                   seq1 = seq2;
2379                 else if (seq2 != NULL)
2380                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2381                 if (seq1 == NULL)
2382                   return REG_ESPACE;
2383                 node->obj = seq1->obj;
2384                 node->type = seq1->type;
2385               }
2386
2387             iter_depth--;
2388             pos_add_total += pos_add - pos_add_last;
2389             if (iter_depth == 0)
2390               pos_add = pos_add_total;
2391
2392             break;
2393           }
2394         default:
2395           assert(0);
2396           break;
2397         }
2398     }
2399
2400   *position += pos_add_total;
2401
2402   /* `max_pos' should never be larger than `*position' if the above
2403      code works, but just an extra safeguard let's make sure
2404      `*position' is set large enough so enough memory will be
2405      allocated for the transition table. */
2406   if (max_pos > *position)
2407     *position = max_pos;
2408
2409   return status;
2410 }
2411
2412 static tre_pos_and_tags_t *
2413 tre_set_empty(tre_mem_t mem)
2414 {
2415   tre_pos_and_tags_t *new_set;
2416
2417   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2418   if (new_set == NULL)
2419     return NULL;
2420
2421   new_set[0].position = -1;
2422   new_set[0].code_min = -1;
2423   new_set[0].code_max = -1;
2424
2425   return new_set;
2426 }
2427
2428 static tre_pos_and_tags_t *
2429 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2430             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2431 {
2432   tre_pos_and_tags_t *new_set;
2433
2434   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2435   if (new_set == NULL)
2436     return NULL;
2437
2438   new_set[0].position = position;
2439   new_set[0].code_min = code_min;
2440   new_set[0].code_max = code_max;
2441   new_set[0].class = class;
2442   new_set[0].neg_classes = neg_classes;
2443   new_set[0].backref = backref;
2444   new_set[1].position = -1;
2445   new_set[1].code_min = -1;
2446   new_set[1].code_max = -1;
2447
2448   return new_set;
2449 }
2450
2451 static tre_pos_and_tags_t *
2452 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2453               int *tags, int assertions)
2454 {
2455   int s1, s2, i, j;
2456   tre_pos_and_tags_t *new_set;
2457   int *new_tags;
2458   int num_tags;
2459
2460   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2461   for (s1 = 0; set1[s1].position >= 0; s1++);
2462   for (s2 = 0; set2[s2].position >= 0; s2++);
2463   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2464   if (!new_set )
2465     return NULL;
2466
2467   for (s1 = 0; set1[s1].position >= 0; s1++)
2468     {
2469       new_set[s1].position = set1[s1].position;
2470       new_set[s1].code_min = set1[s1].code_min;
2471       new_set[s1].code_max = set1[s1].code_max;
2472       new_set[s1].assertions = set1[s1].assertions | assertions;
2473       new_set[s1].class = set1[s1].class;
2474       new_set[s1].neg_classes = set1[s1].neg_classes;
2475       new_set[s1].backref = set1[s1].backref;
2476       if (set1[s1].tags == NULL && tags == NULL)
2477         new_set[s1].tags = NULL;
2478       else
2479         {
2480           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2481           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2482                                          * (i + num_tags + 1)));
2483           if (new_tags == NULL)
2484             return NULL;
2485           for (j = 0; j < i; j++)
2486             new_tags[j] = set1[s1].tags[j];
2487           for (i = 0; i < num_tags; i++)
2488             new_tags[j + i] = tags[i];
2489           new_tags[j + i] = -1;
2490           new_set[s1].tags = new_tags;
2491         }
2492     }
2493
2494   for (s2 = 0; set2[s2].position >= 0; s2++)
2495     {
2496       new_set[s1 + s2].position = set2[s2].position;
2497       new_set[s1 + s2].code_min = set2[s2].code_min;
2498       new_set[s1 + s2].code_max = set2[s2].code_max;
2499       /* XXX - why not | assertions here as well? */
2500       new_set[s1 + s2].assertions = set2[s2].assertions;
2501       new_set[s1 + s2].class = set2[s2].class;
2502       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2503       new_set[s1 + s2].backref = set2[s2].backref;
2504       if (set2[s2].tags == NULL)
2505         new_set[s1 + s2].tags = NULL;
2506       else
2507         {
2508           for (i = 0; set2[s2].tags[i] >= 0; i++);
2509           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2510           if (new_tags == NULL)
2511             return NULL;
2512           for (j = 0; j < i; j++)
2513             new_tags[j] = set2[s2].tags[j];
2514           new_tags[j] = -1;
2515           new_set[s1 + s2].tags = new_tags;
2516         }
2517     }
2518   new_set[s1 + s2].position = -1;
2519   return new_set;
2520 }
2521
2522 /* Finds the empty path through `node' which is the one that should be
2523    taken according to POSIX.2 rules, and adds the tags on that path to
2524    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2525    set to the number of tags seen on the path. */
2526 static reg_errcode_t
2527 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2528                 int *assertions, int *num_tags_seen)
2529 {
2530   tre_literal_t *lit;
2531   tre_union_t *uni;
2532   tre_catenation_t *cat;
2533   tre_iteration_t *iter;
2534   int i;
2535   int bottom = tre_stack_num_objects(stack);
2536   reg_errcode_t status = REG_OK;
2537   if (num_tags_seen)
2538     *num_tags_seen = 0;
2539
2540   status = tre_stack_push_voidptr(stack, node);
2541
2542   /* Walk through the tree recursively. */
2543   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2544     {
2545       node = tre_stack_pop_voidptr(stack);
2546
2547       switch (node->type)
2548         {
2549         case LITERAL:
2550           lit = (tre_literal_t *)node->obj;
2551           switch (lit->code_min)
2552             {
2553             case TAG:
2554               if (lit->code_max >= 0)
2555                 {
2556                   if (tags != NULL)
2557                     {
2558                       /* Add the tag to `tags'. */
2559                       for (i = 0; tags[i] >= 0; i++)
2560                         if (tags[i] == lit->code_max)
2561                           break;
2562                       if (tags[i] < 0)
2563                         {
2564                           tags[i] = lit->code_max;
2565                           tags[i + 1] = -1;
2566                         }
2567                     }
2568                   if (num_tags_seen)
2569                     (*num_tags_seen)++;
2570                 }
2571               break;
2572             case ASSERTION:
2573               assert(lit->code_max >= 1
2574                      || lit->code_max <= ASSERT_LAST);
2575               if (assertions != NULL)
2576                 *assertions |= lit->code_max;
2577               break;
2578             case EMPTY:
2579               break;
2580             default:
2581               assert(0);
2582               break;
2583             }
2584           break;
2585
2586         case UNION:
2587           /* Subexpressions starting earlier take priority over ones
2588              starting later, so we prefer the left subexpression over the
2589              right subexpression. */
2590           uni = (tre_union_t *)node->obj;
2591           if (uni->left->nullable)
2592             STACK_PUSHX(stack, voidptr, uni->left)
2593           else if (uni->right->nullable)
2594             STACK_PUSHX(stack, voidptr, uni->right)
2595           else
2596             assert(0);
2597           break;
2598
2599         case CATENATION:
2600           /* The path must go through both children. */
2601           cat = (tre_catenation_t *)node->obj;
2602           assert(cat->left->nullable);
2603           assert(cat->right->nullable);
2604           STACK_PUSHX(stack, voidptr, cat->left);
2605           STACK_PUSHX(stack, voidptr, cat->right);
2606           break;
2607
2608         case ITERATION:
2609           /* A match with an empty string is preferred over no match at
2610              all, so we go through the argument if possible. */
2611           iter = (tre_iteration_t *)node->obj;
2612           if (iter->arg->nullable)
2613             STACK_PUSHX(stack, voidptr, iter->arg);
2614           break;
2615
2616         default:
2617           assert(0);
2618           break;
2619         }
2620     }
2621
2622   return status;
2623 }
2624
2625
2626 typedef enum {
2627   NFL_RECURSE,
2628   NFL_POST_UNION,
2629   NFL_POST_CATENATION,
2630   NFL_POST_ITERATION
2631 } tre_nfl_stack_symbol_t;
2632
2633
2634 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2635    the nodes of the AST `tree'. */
2636 static reg_errcode_t
2637 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2638 {
2639   int bottom = tre_stack_num_objects(stack);
2640
2641   STACK_PUSHR(stack, voidptr, tree);
2642   STACK_PUSHR(stack, int, NFL_RECURSE);
2643
2644   while (tre_stack_num_objects(stack) > bottom)
2645     {
2646       tre_nfl_stack_symbol_t symbol;
2647       tre_ast_node_t *node;
2648
2649       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2650       node = tre_stack_pop_voidptr(stack);
2651       switch (symbol)
2652         {
2653         case NFL_RECURSE:
2654           switch (node->type)
2655             {
2656             case LITERAL:
2657               {
2658                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2659                 if (IS_BACKREF(lit))
2660                   {
2661                     /* Back references: nullable = false, firstpos = {i},
2662                        lastpos = {i}. */
2663                     node->nullable = 0;
2664                     node->firstpos = tre_set_one(mem, lit->position, 0,
2665                                              TRE_CHAR_MAX, 0, NULL, -1);
2666                     if (!node->firstpos)
2667                       return REG_ESPACE;
2668                     node->lastpos = tre_set_one(mem, lit->position, 0,
2669                                                 TRE_CHAR_MAX, 0, NULL,
2670                                                 (int)lit->code_max);
2671                     if (!node->lastpos)
2672                       return REG_ESPACE;
2673                   }
2674                 else if (lit->code_min < 0)
2675                   {
2676                     /* Tags, empty strings, params, and zero width assertions:
2677                        nullable = true, firstpos = {}, and lastpos = {}. */
2678                     node->nullable = 1;
2679                     node->firstpos = tre_set_empty(mem);
2680                     if (!node->firstpos)
2681                       return REG_ESPACE;
2682                     node->lastpos = tre_set_empty(mem);
2683                     if (!node->lastpos)
2684                       return REG_ESPACE;
2685                   }
2686                 else
2687                   {
2688                     /* Literal at position i: nullable = false, firstpos = {i},
2689                        lastpos = {i}. */
2690                     node->nullable = 0;
2691                     node->firstpos =
2692                       tre_set_one(mem, lit->position, (int)lit->code_min,
2693                                   (int)lit->code_max, 0, NULL, -1);
2694                     if (!node->firstpos)
2695                       return REG_ESPACE;
2696                     node->lastpos = tre_set_one(mem, lit->position,
2697                                                 (int)lit->code_min,
2698                                                 (int)lit->code_max,
2699                                                 lit->class, lit->neg_classes,
2700                                                 -1);
2701                     if (!node->lastpos)
2702                       return REG_ESPACE;
2703                   }
2704                 break;
2705               }
2706
2707             case UNION:
2708               /* Compute the attributes for the two subtrees, and after that
2709                  for this node. */
2710               STACK_PUSHR(stack, voidptr, node);
2711               STACK_PUSHR(stack, int, NFL_POST_UNION);
2712               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2713               STACK_PUSHR(stack, int, NFL_RECURSE);
2714               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2715               STACK_PUSHR(stack, int, NFL_RECURSE);
2716               break;
2717
2718             case CATENATION:
2719               /* Compute the attributes for the two subtrees, and after that
2720                  for this node. */
2721               STACK_PUSHR(stack, voidptr, node);
2722               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2723               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2724               STACK_PUSHR(stack, int, NFL_RECURSE);
2725               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2726               STACK_PUSHR(stack, int, NFL_RECURSE);
2727               break;
2728
2729             case ITERATION:
2730               /* Compute the attributes for the subtree, and after that for
2731                  this node. */
2732               STACK_PUSHR(stack, voidptr, node);
2733               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2734               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2735               STACK_PUSHR(stack, int, NFL_RECURSE);
2736               break;
2737             }
2738           break; /* end case: NFL_RECURSE */
2739
2740         case NFL_POST_UNION:
2741           {
2742             tre_union_t *uni = (tre_union_t *)node->obj;
2743             node->nullable = uni->left->nullable || uni->right->nullable;
2744             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2745                                            uni->right->firstpos, NULL, 0);
2746             if (!node->firstpos)
2747               return REG_ESPACE;
2748             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2749                                           uni->right->lastpos, NULL, 0);
2750             if (!node->lastpos)
2751               return REG_ESPACE;
2752             break;
2753           }
2754
2755         case NFL_POST_ITERATION:
2756           {
2757             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2758
2759             if (iter->min == 0 || iter->arg->nullable)
2760               node->nullable = 1;
2761             else
2762               node->nullable = 0;
2763             node->firstpos = iter->arg->firstpos;
2764             node->lastpos = iter->arg->lastpos;
2765             break;
2766           }
2767
2768         case NFL_POST_CATENATION:
2769           {
2770             int num_tags, *tags, assertions;
2771             reg_errcode_t status;
2772             tre_catenation_t *cat = node->obj;
2773             node->nullable = cat->left->nullable && cat->right->nullable;
2774
2775             /* Compute firstpos. */
2776             if (cat->left->nullable)
2777               {
2778                 /* The left side matches the empty string.  Make a first pass
2779                    with tre_match_empty() to get the number of tags and
2780                    parameters. */
2781                 status = tre_match_empty(stack, cat->left,
2782                                          NULL, NULL, &num_tags);
2783                 if (status != REG_OK)
2784                   return status;
2785                 /* Allocate arrays for the tags and parameters. */
2786                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2787                 if (!tags)
2788                   return REG_ESPACE;
2789                 tags[0] = -1;
2790                 assertions = 0;
2791                 /* Second pass with tre_mach_empty() to get the list of
2792                    tags and parameters. */
2793                 status = tre_match_empty(stack, cat->left, tags,
2794                                          &assertions, NULL);
2795                 if (status != REG_OK)
2796                   {
2797                     xfree(tags);
2798                     return status;
2799                   }
2800                 node->firstpos =
2801                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2802                                 tags, assertions);
2803                 xfree(tags);
2804                 if (!node->firstpos)
2805                   return REG_ESPACE;
2806               }
2807             else
2808               {
2809                 node->firstpos = cat->left->firstpos;
2810               }
2811
2812             /* Compute lastpos. */
2813             if (cat->right->nullable)
2814               {
2815                 /* The right side matches the empty string.  Make a first pass
2816                    with tre_match_empty() to get the number of tags and
2817                    parameters. */
2818                 status = tre_match_empty(stack, cat->right,
2819                                          NULL, NULL, &num_tags);
2820                 if (status != REG_OK)
2821                   return status;
2822                 /* Allocate arrays for the tags and parameters. */
2823                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2824                 if (!tags)
2825                   return REG_ESPACE;
2826                 tags[0] = -1;
2827                 assertions = 0;
2828                 /* Second pass with tre_mach_empty() to get the list of
2829                    tags and parameters. */
2830                 status = tre_match_empty(stack, cat->right, tags,
2831                                          &assertions, NULL);
2832                 if (status != REG_OK)
2833                   {
2834                     xfree(tags);
2835                     return status;
2836                   }
2837                 node->lastpos =
2838                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2839                                 tags, assertions);
2840                 xfree(tags);
2841                 if (!node->lastpos)
2842                   return REG_ESPACE;
2843               }
2844             else
2845               {
2846                 node->lastpos = cat->right->lastpos;
2847               }
2848             break;
2849           }
2850
2851         default:
2852           assert(0);
2853           break;
2854         }
2855     }
2856
2857   return REG_OK;
2858 }
2859
2860
2861 /* Adds a transition from each position in `p1' to each position in `p2'. */
2862 static reg_errcode_t
2863 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2864                tre_tnfa_transition_t *transitions,
2865                int *counts, int *offs)
2866 {
2867   tre_pos_and_tags_t *orig_p2 = p2;
2868   tre_tnfa_transition_t *trans;
2869   int i, j, k, l, dup, prev_p2_pos;
2870
2871   if (transitions != NULL)
2872     while (p1->position >= 0)
2873       {
2874         p2 = orig_p2;
2875         prev_p2_pos = -1;
2876         while (p2->position >= 0)
2877           {
2878             /* Optimization: if this position was already handled, skip it. */
2879             if (p2->position == prev_p2_pos)
2880               {
2881                 p2++;
2882                 continue;
2883               }
2884             prev_p2_pos = p2->position;
2885             /* Set `trans' to point to the next unused transition from
2886                position `p1->position'. */
2887             trans = transitions + offs[p1->position];
2888             while (trans->state != NULL)
2889               {
2890 #if 0
2891                 /* If we find a previous transition from `p1->position' to
2892                    `p2->position', it is overwritten.  This can happen only
2893                    if there are nested loops in the regexp, like in "((a)*)*".
2894                    In POSIX.2 repetition using the outer loop is always
2895                    preferred over using the inner loop.  Therefore the
2896                    transition for the inner loop is useless and can be thrown
2897                    away. */
2898                 /* XXX - The same position is used for all nodes in a bracket
2899                    expression, so this optimization cannot be used (it will
2900                    break bracket expressions) unless I figure out a way to
2901                    detect it here. */
2902                 if (trans->state_id == p2->position)
2903                   {
2904                     break;
2905                   }
2906 #endif
2907                 trans++;
2908               }
2909
2910             if (trans->state == NULL)
2911               (trans + 1)->state = NULL;
2912             /* Use the character ranges, assertions, etc. from `p1' for
2913                the transition from `p1' to `p2'. */
2914             trans->code_min = p1->code_min;
2915             trans->code_max = p1->code_max;
2916             trans->state = transitions + offs[p2->position];
2917             trans->state_id = p2->position;
2918             trans->assertions = p1->assertions | p2->assertions
2919               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2920               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2921             if (p1->backref >= 0)
2922               {
2923                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2924                 assert(p2->backref < 0);
2925                 trans->u.backref = p1->backref;
2926                 trans->assertions |= ASSERT_BACKREF;
2927               }
2928             else
2929               trans->u.class = p1->class;
2930             if (p1->neg_classes != NULL)
2931               {
2932                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2933                 trans->neg_classes =
2934                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2935                 if (trans->neg_classes == NULL)
2936                   return REG_ESPACE;
2937                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2938                   trans->neg_classes[i] = p1->neg_classes[i];
2939                 trans->neg_classes[i] = (tre_ctype_t)0;
2940               }
2941             else
2942               trans->neg_classes = NULL;
2943
2944             /* Find out how many tags this transition has. */
2945             i = 0;
2946             if (p1->tags != NULL)
2947               while(p1->tags[i] >= 0)
2948                 i++;
2949             j = 0;
2950             if (p2->tags != NULL)
2951               while(p2->tags[j] >= 0)
2952                 j++;
2953
2954             /* If we are overwriting a transition, free the old tag array. */
2955             if (trans->tags != NULL)
2956               xfree(trans->tags);
2957             trans->tags = NULL;
2958
2959             /* If there were any tags, allocate an array and fill it. */
2960             if (i + j > 0)
2961               {
2962                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2963                 if (!trans->tags)
2964                   return REG_ESPACE;
2965                 i = 0;
2966                 if (p1->tags != NULL)
2967                   while(p1->tags[i] >= 0)
2968                     {
2969                       trans->tags[i] = p1->tags[i];
2970                       i++;
2971                     }
2972                 l = i;
2973                 j = 0;
2974                 if (p2->tags != NULL)
2975                   while (p2->tags[j] >= 0)
2976                     {
2977                       /* Don't add duplicates. */
2978                       dup = 0;
2979                       for (k = 0; k < i; k++)
2980                         if (trans->tags[k] == p2->tags[j])
2981                           {
2982                             dup = 1;
2983                             break;
2984                           }
2985                       if (!dup)
2986                         trans->tags[l++] = p2->tags[j];
2987                       j++;
2988                     }
2989                 trans->tags[l] = -1;
2990               }
2991
2992             p2++;
2993           }
2994         p1++;
2995       }
2996   else
2997     /* Compute a maximum limit for the number of transitions leaving
2998        from each state. */
2999     while (p1->position >= 0)
3000       {
3001         p2 = orig_p2;
3002         while (p2->position >= 0)
3003           {
3004             counts[p1->position]++;
3005             p2++;
3006           }
3007         p1++;
3008       }
3009   return REG_OK;
3010 }
3011
3012 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
3013    labelled with one character range (there are no transitions on empty
3014    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
3015    the regexp. */
3016 static reg_errcode_t
3017 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3018                 int *counts, int *offs)
3019 {
3020   tre_union_t *uni;
3021   tre_catenation_t *cat;
3022   tre_iteration_t *iter;
3023   reg_errcode_t errcode = REG_OK;
3024
3025   /* XXX - recurse using a stack!. */
3026   switch (node->type)
3027     {
3028     case LITERAL:
3029       break;
3030     case UNION:
3031       uni = (tre_union_t *)node->obj;
3032       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3033       if (errcode != REG_OK)
3034         return errcode;
3035       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3036       break;
3037
3038     case CATENATION:
3039       cat = (tre_catenation_t *)node->obj;
3040       /* Add a transition from each position in cat->left->lastpos
3041          to each position in cat->right->firstpos. */
3042       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3043                                transitions, counts, offs);
3044       if (errcode != REG_OK)
3045         return errcode;
3046       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3047       if (errcode != REG_OK)
3048         return errcode;
3049       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3050       break;
3051
3052     case ITERATION:
3053       iter = (tre_iteration_t *)node->obj;
3054       assert(iter->max == -1 || iter->max == 1);
3055
3056       if (iter->max == -1)
3057         {
3058           assert(iter->min == 0 || iter->min == 1);
3059           /* Add a transition from each last position in the iterated
3060              expression to each first position. */
3061           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3062                                    transitions, counts, offs);
3063           if (errcode != REG_OK)
3064             return errcode;
3065         }
3066       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3067       break;
3068     }
3069   return errcode;
3070 }
3071
3072
3073 #define ERROR_EXIT(err)           \
3074   do                              \
3075     {                             \
3076       errcode = err;              \
3077       if (/*CONSTCOND*/1)         \
3078         goto error_exit;          \
3079     }                             \
3080  while (/*CONSTCOND*/0)
3081
3082
3083 int
3084 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
3085 {
3086   tre_stack_t *stack;
3087   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3088   tre_pos_and_tags_t *p;
3089   int *counts = NULL, *offs = NULL;
3090   int i, add = 0;
3091   tre_tnfa_transition_t *transitions, *initial;
3092   tre_tnfa_t *tnfa = NULL;
3093   tre_submatch_data_t *submatch_data;
3094   tre_tag_direction_t *tag_directions = NULL;
3095   reg_errcode_t errcode;
3096   tre_mem_t mem;
3097
3098   /* Parse context. */
3099   tre_parse_ctx_t parse_ctx;
3100
3101   /* Allocate a stack used throughout the compilation process for various
3102      purposes. */
3103   stack = tre_stack_new(512, 10240, 128);
3104   if (!stack)
3105     return REG_ESPACE;
3106   /* Allocate a fast memory allocator. */
3107   mem = tre_mem_new();
3108   if (!mem)
3109     {
3110       tre_stack_destroy(stack);
3111       return REG_ESPACE;
3112     }
3113
3114   /* Parse the regexp. */
3115   memset(&parse_ctx, 0, sizeof(parse_ctx));
3116   parse_ctx.mem = mem;
3117   parse_ctx.stack = stack;
3118   parse_ctx.re = regex;
3119   parse_ctx.cflags = cflags;
3120   parse_ctx.max_backref = -1;
3121   errcode = tre_parse(&parse_ctx);
3122   if (errcode != REG_OK)
3123     ERROR_EXIT(errcode);
3124   preg->re_nsub = parse_ctx.submatch_id - 1;
3125   tree = parse_ctx.result;
3126
3127   /* Back references and approximate matching cannot currently be used
3128      in the same regexp. */
3129   if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3130     ERROR_EXIT(REG_BADPAT);
3131
3132 #ifdef TRE_DEBUG
3133   tre_ast_print(tree);
3134 #endif /* TRE_DEBUG */
3135
3136   /* Referring to nonexistent subexpressions is illegal. */
3137   if (parse_ctx.max_backref > (int)preg->re_nsub)
3138     ERROR_EXIT(REG_ESUBREG);
3139
3140   /* Allocate the TNFA struct. */
3141   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3142   if (tnfa == NULL)
3143     ERROR_EXIT(REG_ESPACE);
3144   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3145   tnfa->have_approx = parse_ctx.have_approx;
3146   tnfa->num_submatches = parse_ctx.submatch_id;
3147
3148   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
3149      regexp does not have back references, this can be skipped. */
3150   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3151     {
3152
3153       /* Figure out how many tags we will need. */
3154       errcode = tre_add_tags(NULL, stack, tree, tnfa);
3155       if (errcode != REG_OK)
3156         ERROR_EXIT(errcode);
3157
3158       if (tnfa->num_tags > 0)
3159         {
3160           tag_directions = xmalloc(sizeof(*tag_directions)
3161                                    * (tnfa->num_tags + 1));
3162           if (tag_directions == NULL)
3163             ERROR_EXIT(REG_ESPACE);
3164           tnfa->tag_directions = tag_directions;
3165           memset(tag_directions, -1,
3166                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
3167         }
3168       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3169                                    sizeof(*tnfa->minimal_tags));
3170       if (tnfa->minimal_tags == NULL)
3171         ERROR_EXIT(REG_ESPACE);
3172
3173       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3174                               sizeof(*submatch_data));
3175       if (submatch_data == NULL)
3176         ERROR_EXIT(REG_ESPACE);
3177       tnfa->submatch_data = submatch_data;
3178
3179       errcode = tre_add_tags(mem, stack, tree, tnfa);
3180       if (errcode != REG_OK)
3181         ERROR_EXIT(errcode);
3182
3183     }
3184
3185   /* Expand iteration nodes. */
3186   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3187                            tag_directions);
3188   if (errcode != REG_OK)
3189     ERROR_EXIT(errcode);
3190
3191   /* Add a dummy node for the final state.
3192      XXX - For certain patterns this dummy node can be optimized away,
3193            for example "a*" or "ab*".   Figure out a simple way to detect
3194            this possibility. */
3195   tmp_ast_l = tree;
3196   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3197   if (tmp_ast_r == NULL)
3198     ERROR_EXIT(REG_ESPACE);
3199
3200   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3201   if (tree == NULL)
3202     ERROR_EXIT(REG_ESPACE);
3203
3204   errcode = tre_compute_nfl(mem, stack, tree);
3205   if (errcode != REG_OK)
3206     ERROR_EXIT(errcode);
3207
3208   counts = xmalloc(sizeof(int) * parse_ctx.position);
3209   if (counts == NULL)
3210     ERROR_EXIT(REG_ESPACE);
3211
3212   offs = xmalloc(sizeof(int) * parse_ctx.position);
3213   if (offs == NULL)
3214     ERROR_EXIT(REG_ESPACE);
3215
3216   for (i = 0; i < parse_ctx.position; i++)
3217     counts[i] = 0;
3218   tre_ast_to_tnfa(tree, NULL, counts, NULL);
3219
3220   add = 0;
3221   for (i = 0; i < parse_ctx.position; i++)
3222     {
3223       offs[i] = add;
3224       add += counts[i] + 1;
3225       counts[i] = 0;
3226     }
3227   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3228   if (transitions == NULL)
3229     ERROR_EXIT(REG_ESPACE);
3230   tnfa->transitions = transitions;
3231   tnfa->num_transitions = add;
3232
3233   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3234   if (errcode != REG_OK)
3235     ERROR_EXIT(errcode);
3236
3237   tnfa->firstpos_chars = NULL;
3238
3239   p = tree->firstpos;
3240   i = 0;
3241   while (p->position >= 0)
3242     {
3243       i++;
3244       p++;
3245     }
3246
3247   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3248   if (initial == NULL)
3249     ERROR_EXIT(REG_ESPACE);
3250   tnfa->initial = initial;
3251
3252   i = 0;
3253   for (p = tree->firstpos; p->position >= 0; p++)
3254     {
3255       initial[i].state = transitions + offs[p->position];
3256       initial[i].state_id = p->position;
3257       initial[i].tags = NULL;
3258       /* Copy the arrays p->tags, and p->params, they are allocated
3259          from a tre_mem object. */
3260       if (p->tags)
3261         {
3262           int j;
3263           for (j = 0; p->tags[j] >= 0; j++);
3264           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3265           if (!initial[i].tags)
3266             ERROR_EXIT(REG_ESPACE);
3267           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3268         }
3269       initial[i].assertions = p->assertions;
3270       i++;
3271     }
3272   initial[i].state = NULL;
3273
3274   tnfa->num_transitions = add;
3275   tnfa->final = transitions + offs[tree->lastpos[0].position];
3276   tnfa->num_states = parse_ctx.position;
3277   tnfa->cflags = cflags;
3278
3279   tre_mem_destroy(mem);
3280   tre_stack_destroy(stack);
3281   xfree(counts);
3282   xfree(offs);
3283
3284   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3285   return REG_OK;
3286
3287  error_exit:
3288   /* Free everything that was allocated and return the error code. */
3289   tre_mem_destroy(mem);
3290   if (stack != NULL)
3291     tre_stack_destroy(stack);
3292   if (counts != NULL)
3293     xfree(counts);
3294   if (offs != NULL)
3295     xfree(offs);
3296   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3297   regfree(preg);
3298   return errcode;
3299 }
3300
3301
3302
3303
3304 void
3305 regfree(regex_t *preg)
3306 {
3307   tre_tnfa_t *tnfa;
3308   unsigned int i;
3309   tre_tnfa_transition_t *trans;
3310
3311   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3312   if (!tnfa)
3313     return;
3314
3315   for (i = 0; i < tnfa->num_transitions; i++)
3316     if (tnfa->transitions[i].state)
3317       {
3318         if (tnfa->transitions[i].tags)
3319           xfree(tnfa->transitions[i].tags);
3320         if (tnfa->transitions[i].neg_classes)
3321           xfree(tnfa->transitions[i].neg_classes);
3322       }
3323   if (tnfa->transitions)
3324     xfree(tnfa->transitions);
3325
3326   if (tnfa->initial)
3327     {
3328       for (trans = tnfa->initial; trans->state; trans++)
3329         {
3330           if (trans->tags)
3331             xfree(trans->tags);
3332         }
3333       xfree(tnfa->initial);
3334     }
3335
3336   if (tnfa->submatch_data)
3337     {
3338       for (i = 0; i < tnfa->num_submatches; i++)
3339         if (tnfa->submatch_data[i].parents)
3340           xfree(tnfa->submatch_data[i].parents);
3341       xfree(tnfa->submatch_data);
3342     }
3343
3344   if (tnfa->tag_directions)
3345     xfree(tnfa->tag_directions);
3346   if (tnfa->firstpos_chars)
3347     xfree(tnfa->firstpos_chars);
3348   if (tnfa->minimal_tags)
3349     xfree(tnfa->minimal_tags);
3350   xfree(tnfa);
3351 }