revert regex "cleanup" that seems unjustified and may break backtracking
[musl] / src / regex / regexec.c
1 /*
2   regexec.c - TRE POSIX compatible matching functions (and more).
3
4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5   All rights reserved.
6
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions
9   are met:
10
11     1. Redistributions of source code must retain the above copyright
12        notice, this list of conditions and the following disclaimer.
13
14     2. Redistributions in binary form must reproduce the above copyright
15        notice, this list of conditions and the following disclaimer in the
16        documentation and/or other materials provided with the distribution.
17
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 */
31
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include <wctype.h>
36 #include <limits.h>
37
38 #include <regex.h>
39
40 #include "tre.h"
41
42 #include <assert.h>
43
44 static void
45 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
46                 const tre_tnfa_t *tnfa, int *tags, int match_eo);
47
48 /***********************************************************************
49  from tre-match-utils.h
50 ***********************************************************************/
51
52 #define GET_NEXT_WCHAR() do {                                                 \
53     prev_c = next_c; pos += pos_add_next;                                     \
54     if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
55         if (pos_add_next < 0) return REG_NOMATCH;                             \
56         else pos_add_next++;                                                  \
57     }                                                                         \
58     str_byte += pos_add_next;                                                 \
59   } while (0)
60
61 #define IS_WORD_CHAR(c)  ((c) == L'_' || tre_isalnum(c))
62
63 #define CHECK_ASSERTIONS(assertions)                                          \
64   (((assertions & ASSERT_AT_BOL)                                              \
65     && (pos > 0 || reg_notbol)                                                \
66     && (prev_c != L'\n' || !reg_newline))                                     \
67    || ((assertions & ASSERT_AT_EOL)                                           \
68        && (next_c != L'\0' || reg_noteol)                                     \
69        && (next_c != L'\n' || !reg_newline))                                  \
70    || ((assertions & ASSERT_AT_BOW)                                           \
71        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))                    \
72    || ((assertions & ASSERT_AT_EOW)                                           \
73        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))                    \
74    || ((assertions & ASSERT_AT_WB)                                            \
75        && (pos != 0 && next_c != L'\0'                                        \
76            && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))                  \
77    || ((assertions & ASSERT_AT_WB_NEG)                                        \
78        && (pos == 0 || next_c == L'\0'                                        \
79            || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
80
81 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
82   (((trans_i->assertions & ASSERT_CHAR_CLASS)                                 \
83        && !(tnfa->cflags & REG_ICASE)                                         \
84        && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))                 \
85     || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
86         && (tnfa->cflags & REG_ICASE)                                         \
87         && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
88         && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
89     || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
90         && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
91                                       tnfa->cflags & REG_ICASE)))
92
93
94
95
96 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
97 static int
98 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
99               int *t1, int *t2)
100 {
101   int i;
102   for (i = 0; i < num_tags; i++)
103     {
104       if (tag_directions[i] == TRE_TAG_MINIMIZE)
105         {
106           if (t1[i] < t2[i])
107             return 1;
108           if (t1[i] > t2[i])
109             return 0;
110         }
111       else
112         {
113           if (t1[i] > t2[i])
114             return 1;
115           if (t1[i] < t2[i])
116             return 0;
117         }
118     }
119   /*  assert(0);*/
120   return 0;
121 }
122
123 static int
124 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
125 {
126   while (*classes != (tre_ctype_t)0)
127     if ((!icase && tre_isctype(wc, *classes))
128         || (icase && (tre_isctype(tre_toupper(wc), *classes)
129                       || tre_isctype(tre_tolower(wc), *classes))))
130       return 1; /* Match. */
131     else
132       classes++;
133   return 0; /* No match. */
134 }
135
136
137 /***********************************************************************
138  from tre-match-parallel.c
139 ***********************************************************************/
140
141 /*
142   This algorithm searches for matches basically by reading characters
143   in the searched string one by one, starting at the beginning.  All
144   matching paths in the TNFA are traversed in parallel.  When two or
145   more paths reach the same state, exactly one is chosen according to
146   tag ordering rules; if returning submatches is not required it does
147   not matter which path is chosen.
148
149   The worst case time required for finding the leftmost and longest
150   match, or determining that there is no match, is always linearly
151   dependent on the length of the text being searched.
152
153   This algorithm cannot handle TNFAs with back referencing nodes.
154   See `tre-match-backtrack.c'.
155 */
156
157 typedef struct {
158   tre_tnfa_transition_t *state;
159   int *tags;
160 } tre_tnfa_reach_t;
161
162 typedef struct {
163   int pos;
164   int **tags;
165 } tre_reach_pos_t;
166
167
168 static reg_errcode_t
169 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
170                       int *match_tags, int eflags,
171                       int *match_end_ofs)
172 {
173   /* State variables required by GET_NEXT_WCHAR. */
174   tre_char_t prev_c = 0, next_c = 0;
175   const char *str_byte = string;
176   int pos = -1;
177   int pos_add_next = 1;
178 #ifdef TRE_MBSTATE
179   mbstate_t mbstate;
180 #endif /* TRE_MBSTATE */
181   int reg_notbol = eflags & REG_NOTBOL;
182   int reg_noteol = eflags & REG_NOTEOL;
183   int reg_newline = tnfa->cflags & REG_NEWLINE;
184
185   char *buf;
186   tre_tnfa_transition_t *trans_i;
187   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
188   tre_reach_pos_t *reach_pos;
189   int *tag_i;
190   int num_tags, i;
191
192   int match_eo = -1;       /* end offset of match (-1 if no match found yet) */
193   int new_match = 0;
194   int *tmp_tags = NULL;
195   int *tmp_iptr;
196
197 #ifdef TRE_MBSTATE
198   memset(&mbstate, '\0', sizeof(mbstate));
199 #endif /* TRE_MBSTATE */
200
201   if (!match_tags)
202     num_tags = 0;
203   else
204     num_tags = tnfa->num_tags;
205
206   /* Allocate memory for temporary data required for matching.  This needs to
207      be done for every matching operation to be thread safe.  This allocates
208      everything in a single large block from the stack frame using alloca()
209      or with malloc() if alloca is unavailable. */
210   {
211     int tbytes, rbytes, pbytes, xbytes, total_bytes;
212     char *tmp_buf;
213     /* Compute the length of the block we need. */
214     tbytes = sizeof(*tmp_tags) * num_tags;
215     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
216     pbytes = sizeof(*reach_pos) * tnfa->num_states;
217     xbytes = sizeof(int) * num_tags;
218     total_bytes =
219       (sizeof(long) - 1) * 4 /* for alignment paddings */
220       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
221
222     /* Allocate the memory. */
223     buf = xmalloc((unsigned)total_bytes);
224     if (buf == NULL)
225       return REG_ESPACE;
226     memset(buf, 0, (size_t)total_bytes);
227
228     /* Get the various pointers within tmp_buf (properly aligned). */
229     tmp_tags = (void *)buf;
230     tmp_buf = buf + tbytes;
231     tmp_buf += ALIGN(tmp_buf, long);
232     reach_next = (void *)tmp_buf;
233     tmp_buf += rbytes;
234     tmp_buf += ALIGN(tmp_buf, long);
235     reach = (void *)tmp_buf;
236     tmp_buf += rbytes;
237     tmp_buf += ALIGN(tmp_buf, long);
238     reach_pos = (void *)tmp_buf;
239     tmp_buf += pbytes;
240     tmp_buf += ALIGN(tmp_buf, long);
241     for (i = 0; i < tnfa->num_states; i++)
242       {
243         reach[i].tags = (void *)tmp_buf;
244         tmp_buf += xbytes;
245         reach_next[i].tags = (void *)tmp_buf;
246         tmp_buf += xbytes;
247       }
248   }
249
250   for (i = 0; i < tnfa->num_states; i++)
251     reach_pos[i].pos = -1;
252
253   GET_NEXT_WCHAR();
254   pos = 0;
255
256   reach_next_i = reach_next;
257   while (1)
258     {
259       /* If no match found yet, add the initial states to `reach_next'. */
260       if (match_eo < 0)
261         {
262           trans_i = tnfa->initial;
263           while (trans_i->state != NULL)
264             {
265               if (reach_pos[trans_i->state_id].pos < pos)
266                 {
267                   if (trans_i->assertions
268                       && CHECK_ASSERTIONS(trans_i->assertions))
269                     {
270                       trans_i++;
271                       continue;
272                     }
273
274                   reach_next_i->state = trans_i->state;
275                   for (i = 0; i < num_tags; i++)
276                     reach_next_i->tags[i] = -1;
277                   tag_i = trans_i->tags;
278                   if (tag_i)
279                     while (*tag_i >= 0)
280                       {
281                         if (*tag_i < num_tags)
282                           reach_next_i->tags[*tag_i] = pos;
283                         tag_i++;
284                       }
285                   if (reach_next_i->state == tnfa->final)
286                     {
287                       match_eo = pos;
288                       new_match = 1;
289                       for (i = 0; i < num_tags; i++)
290                         match_tags[i] = reach_next_i->tags[i];
291                     }
292                   reach_pos[trans_i->state_id].pos = pos;
293                   reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
294                   reach_next_i++;
295                 }
296               trans_i++;
297             }
298           reach_next_i->state = NULL;
299         }
300       else
301         {
302           if (num_tags == 0 || reach_next_i == reach_next)
303             /* We have found a match. */
304             break;
305         }
306
307       /* Check for end of string. */
308       if (!next_c) break;
309
310       GET_NEXT_WCHAR();
311
312       /* Swap `reach' and `reach_next'. */
313       reach_i = reach;
314       reach = reach_next;
315       reach_next = reach_i;
316
317       /* For each state in `reach', weed out states that don't fulfill the
318          minimal matching conditions. */
319       if (tnfa->num_minimals && new_match)
320         {
321           new_match = 0;
322           reach_next_i = reach_next;
323           for (reach_i = reach; reach_i->state; reach_i++)
324             {
325               int skip = 0;
326               for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
327                 {
328                   int end = tnfa->minimal_tags[i];
329                   int start = tnfa->minimal_tags[i + 1];
330                   if (end >= num_tags)
331                     {
332                       skip = 1;
333                       break;
334                     }
335                   else if (reach_i->tags[start] == match_tags[start]
336                            && reach_i->tags[end] < match_tags[end])
337                     {
338                       skip = 1;
339                       break;
340                     }
341                 }
342               if (!skip)
343                 {
344                   reach_next_i->state = reach_i->state;
345                   tmp_iptr = reach_next_i->tags;
346                   reach_next_i->tags = reach_i->tags;
347                   reach_i->tags = tmp_iptr;
348                   reach_next_i++;
349                 }
350             }
351           reach_next_i->state = NULL;
352
353           /* Swap `reach' and `reach_next'. */
354           reach_i = reach;
355           reach = reach_next;
356           reach_next = reach_i;
357         }
358
359       /* For each state in `reach' see if there is a transition leaving with
360          the current input symbol to a state not yet in `reach_next', and
361          add the destination states to `reach_next'. */
362       reach_next_i = reach_next;
363       for (reach_i = reach; reach_i->state; reach_i++)
364         {
365           for (trans_i = reach_i->state; trans_i->state; trans_i++)
366             {
367               /* Does this transition match the input symbol? */
368               if (trans_i->code_min <= (tre_cint_t)prev_c &&
369                   trans_i->code_max >= (tre_cint_t)prev_c)
370                 {
371                   if (trans_i->assertions
372                       && (CHECK_ASSERTIONS(trans_i->assertions)
373                           || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
374                     {
375                       continue;
376                     }
377
378                   /* Compute the tags after this transition. */
379                   for (i = 0; i < num_tags; i++)
380                     tmp_tags[i] = reach_i->tags[i];
381                   tag_i = trans_i->tags;
382                   if (tag_i != NULL)
383                     while (*tag_i >= 0)
384                       {
385                         if (*tag_i < num_tags)
386                           tmp_tags[*tag_i] = pos;
387                         tag_i++;
388                       }
389
390                   if (reach_pos[trans_i->state_id].pos < pos)
391                     {
392                       /* Found an unvisited node. */
393                       reach_next_i->state = trans_i->state;
394                       tmp_iptr = reach_next_i->tags;
395                       reach_next_i->tags = tmp_tags;
396                       tmp_tags = tmp_iptr;
397                       reach_pos[trans_i->state_id].pos = pos;
398                       reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
399
400                       if (reach_next_i->state == tnfa->final
401                           && (match_eo == -1
402                               || (num_tags > 0
403                                   && reach_next_i->tags[0] <= match_tags[0])))
404                         {
405                           match_eo = pos;
406                           new_match = 1;
407                           for (i = 0; i < num_tags; i++)
408                             match_tags[i] = reach_next_i->tags[i];
409                         }
410                       reach_next_i++;
411
412                     }
413                   else
414                     {
415                       assert(reach_pos[trans_i->state_id].pos == pos);
416                       /* Another path has also reached this state.  We choose
417                          the winner by examining the tag values for both
418                          paths. */
419                       if (tre_tag_order(num_tags, tnfa->tag_directions,
420                                         tmp_tags,
421                                         *reach_pos[trans_i->state_id].tags))
422                         {
423                           /* The new path wins. */
424                           tmp_iptr = *reach_pos[trans_i->state_id].tags;
425                           *reach_pos[trans_i->state_id].tags = tmp_tags;
426                           if (trans_i->state == tnfa->final)
427                             {
428                               match_eo = pos;
429                               new_match = 1;
430                               for (i = 0; i < num_tags; i++)
431                                 match_tags[i] = tmp_tags[i];
432                             }
433                           tmp_tags = tmp_iptr;
434                         }
435                     }
436                 }
437             }
438         }
439       reach_next_i->state = NULL;
440     }
441
442   if (buf)
443     xfree(buf);
444
445   *match_end_ofs = match_eo;
446   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
447 }
448
449
450
451 /***********************************************************************
452  from tre-match-backtrack.c
453 ***********************************************************************/
454
455 /*
456   This matcher is for regexps that use back referencing.  Regexp matching
457   with back referencing is an NP-complete problem on the number of back
458   references.  The easiest way to match them is to use a backtracking
459   routine which basically goes through all possible paths in the TNFA
460   and chooses the one which results in the best (leftmost and longest)
461   match.  This can be spectacularly expensive and may run out of stack
462   space, but there really is no better known generic algorithm.  Quoting
463   Henry Spencer from comp.compilers:
464   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
465
466     POSIX.2 REs require longest match, which is really exciting to
467     implement since the obsolete ("basic") variant also includes
468     \<digit>.  I haven't found a better way of tackling this than doing
469     a preliminary match using a DFA (or simulation) on a modified RE
470     that just replicates subREs for \<digit>, and then doing a
471     backtracking match to determine whether the subRE matches were
472     right.  This can be rather slow, but I console myself with the
473     thought that people who use \<digit> deserve very slow execution.
474     (Pun unintentional but very appropriate.)
475
476 */
477
478 typedef struct {
479   int pos;
480   const char *str_byte;
481   tre_tnfa_transition_t *state;
482   int state_id;
483   int next_c;
484   int *tags;
485 #ifdef TRE_MBSTATE
486   mbstate_t mbstate;
487 #endif /* TRE_MBSTATE */
488 } tre_backtrack_item_t;
489
490 typedef struct tre_backtrack_struct {
491   tre_backtrack_item_t item;
492   struct tre_backtrack_struct *prev;
493   struct tre_backtrack_struct *next;
494 } *tre_backtrack_t;
495
496 #ifdef TRE_MBSTATE
497 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
498 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
499 #else /* !TRE_MBSTATE */
500 #define BT_STACK_MBSTATE_IN
501 #define BT_STACK_MBSTATE_OUT
502 #endif /* !TRE_MBSTATE */
503
504 #define tre_bt_mem_new            tre_mem_new
505 #define tre_bt_mem_alloc          tre_mem_alloc
506 #define tre_bt_mem_destroy        tre_mem_destroy
507
508
509 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
510   do                                                                          \
511     {                                                                         \
512       int i;                                                                  \
513       if (!stack->next)                                                       \
514         {                                                                     \
515           tre_backtrack_t s;                                                  \
516           s = tre_bt_mem_alloc(mem, sizeof(*s));                              \
517           if (!s)                                                             \
518             {                                                                 \
519               tre_bt_mem_destroy(mem);                                        \
520               if (tags)                                                       \
521                 xfree(tags);                                                  \
522               if (pmatch)                                                     \
523                 xfree(pmatch);                                                \
524               if (states_seen)                                                \
525                 xfree(states_seen);                                           \
526               return REG_ESPACE;                                              \
527             }                                                                 \
528           s->prev = stack;                                                    \
529           s->next = NULL;                                                     \
530           s->item.tags = tre_bt_mem_alloc(mem,                                \
531                                           sizeof(*tags) * tnfa->num_tags);    \
532           if (!s->item.tags)                                                  \
533             {                                                                 \
534               tre_bt_mem_destroy(mem);                                        \
535               if (tags)                                                       \
536                 xfree(tags);                                                  \
537               if (pmatch)                                                     \
538                 xfree(pmatch);                                                \
539               if (states_seen)                                                \
540                 xfree(states_seen);                                           \
541               return REG_ESPACE;                                              \
542             }                                                                 \
543           stack->next = s;                                                    \
544           stack = s;                                                          \
545         }                                                                     \
546       else                                                                    \
547         stack = stack->next;                                                  \
548       stack->item.pos = (_pos);                                               \
549       stack->item.str_byte = (_str_byte);                                     \
550       stack->item.state = (_state);                                           \
551       stack->item.state_id = (_state_id);                                     \
552       stack->item.next_c = (_next_c);                                         \
553       for (i = 0; i < tnfa->num_tags; i++)                                    \
554         stack->item.tags[i] = (_tags)[i];                                     \
555       BT_STACK_MBSTATE_IN;                                                    \
556     }                                                                         \
557   while (0)
558
559 #define BT_STACK_POP()                                                        \
560   do                                                                          \
561     {                                                                         \
562       int i;                                                                  \
563       assert(stack->prev);                                                    \
564       pos = stack->item.pos;                                                  \
565       str_byte = stack->item.str_byte;                                        \
566       state = stack->item.state;                                              \
567       next_c = stack->item.next_c;                                            \
568       for (i = 0; i < tnfa->num_tags; i++)                                    \
569         tags[i] = stack->item.tags[i];                                        \
570       BT_STACK_MBSTATE_OUT;                                                   \
571       stack = stack->prev;                                                    \
572     }                                                                         \
573   while (0)
574
575 #undef MIN
576 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
577
578 static reg_errcode_t
579 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
580                        int *match_tags, int eflags, int *match_end_ofs)
581 {
582   /* State variables required by GET_NEXT_WCHAR. */
583   tre_char_t prev_c = 0, next_c = 0;
584   const char *str_byte = string;
585   int pos = 0;
586   int pos_add_next = 1;
587 #ifdef TRE_MBSTATE
588   mbstate_t mbstate;
589 #endif /* TRE_MBSTATE */
590   int reg_notbol = eflags & REG_NOTBOL;
591   int reg_noteol = eflags & REG_NOTEOL;
592   int reg_newline = tnfa->cflags & REG_NEWLINE;
593
594   /* These are used to remember the necessary values of the above
595      variables to return to the position where the current search
596      started from. */
597   int next_c_start;
598   const char *str_byte_start;
599   int pos_start = -1;
600 #ifdef TRE_MBSTATE
601   mbstate_t mbstate_start;
602 #endif /* TRE_MBSTATE */
603
604   /* End offset of best match so far, or -1 if no match found yet. */
605   int match_eo = -1;
606   /* Tag arrays. */
607   int *next_tags, *tags = NULL;
608   /* Current TNFA state. */
609   tre_tnfa_transition_t *state;
610   int *states_seen = NULL;
611
612   /* Memory allocator to for allocating the backtracking stack. */
613   tre_mem_t mem = tre_bt_mem_new();
614
615   /* The backtracking stack. */
616   tre_backtrack_t stack;
617
618   tre_tnfa_transition_t *trans_i;
619   regmatch_t *pmatch = NULL;
620   int ret;
621
622 #ifdef TRE_MBSTATE
623   memset(&mbstate, '\0', sizeof(mbstate));
624 #endif /* TRE_MBSTATE */
625
626   if (!mem)
627     return REG_ESPACE;
628   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
629   if (!stack)
630     {
631       ret = REG_ESPACE;
632       goto error_exit;
633     }
634   stack->prev = NULL;
635   stack->next = NULL;
636
637   if (tnfa->num_tags)
638     {
639       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
640       if (!tags)
641         {
642           ret = REG_ESPACE;
643           goto error_exit;
644         }
645     }
646   if (tnfa->num_submatches)
647     {
648       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
649       if (!pmatch)
650         {
651           ret = REG_ESPACE;
652           goto error_exit;
653         }
654     }
655   if (tnfa->num_states)
656     {
657       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
658       if (!states_seen)
659         {
660           ret = REG_ESPACE;
661           goto error_exit;
662         }
663     }
664
665  retry:
666   {
667     int i;
668     for (i = 0; i < tnfa->num_tags; i++)
669       {
670         tags[i] = -1;
671         if (match_tags)
672           match_tags[i] = -1;
673       }
674     for (i = 0; i < tnfa->num_states; i++)
675       states_seen[i] = 0;
676   }
677
678   state = NULL;
679   pos = pos_start;
680   GET_NEXT_WCHAR();
681   pos_start = pos;
682   next_c_start = next_c;
683   str_byte_start = str_byte;
684 #ifdef TRE_MBSTATE
685   mbstate_start = mbstate;
686 #endif /* TRE_MBSTATE */
687
688   /* Handle initial states. */
689   next_tags = NULL;
690   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
691     {
692       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
693         {
694           continue;
695         }
696       if (state == NULL)
697         {
698           /* Start from this state. */
699           state = trans_i->state;
700           next_tags = trans_i->tags;
701         }
702       else
703         {
704           /* Backtrack to this state. */
705           BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
706                         trans_i->state_id, next_c, tags, mbstate);
707           {
708             int *tmp = trans_i->tags;
709             if (tmp)
710               while (*tmp >= 0)
711                 stack->item.tags[*tmp++] = pos;
712           }
713         }
714     }
715
716   if (next_tags)
717     for (; *next_tags >= 0; next_tags++)
718       tags[*next_tags] = pos;
719
720
721   if (state == NULL)
722     goto backtrack;
723
724   while (1)
725     {
726       tre_tnfa_transition_t *next_state;
727       int empty_br_match;
728
729       if (state == tnfa->final)
730         {
731           if (match_eo < pos
732               || (match_eo == pos
733                   && match_tags
734                   && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
735                                    tags, match_tags)))
736             {
737               int i;
738               /* This match wins the previous match. */
739               match_eo = pos;
740               if (match_tags)
741                 for (i = 0; i < tnfa->num_tags; i++)
742                   match_tags[i] = tags[i];
743             }
744           /* Our TNFAs never have transitions leaving from the final state,
745              so we jump right to backtracking. */
746           goto backtrack;
747         }
748
749       /* Go to the next character in the input string. */
750       empty_br_match = 0;
751       trans_i = state;
752       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
753         {
754           /* This is a back reference state.  All transitions leaving from
755              this state have the same back reference "assertion".  Instead
756              of reading the next character, we match the back reference. */
757           int so, eo, bt = trans_i->u.backref;
758           int bt_len;
759           int result;
760
761           /* Get the substring we need to match against.  Remember to
762              turn off REG_NOSUB temporarily. */
763           tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
764                           tnfa, tags, pos);
765           so = pmatch[bt].rm_so;
766           eo = pmatch[bt].rm_eo;
767           bt_len = eo - so;
768
769           result = strncmp((const char*)string + so, str_byte - 1,
770                                  (size_t)bt_len);
771
772           if (result == 0)
773             {
774               /* Back reference matched.  Check for infinite loop. */
775               if (bt_len == 0)
776                 empty_br_match = 1;
777               if (empty_br_match && states_seen[trans_i->state_id])
778                 {
779                   goto backtrack;
780                 }
781
782               states_seen[trans_i->state_id] = empty_br_match;
783
784               /* Advance in input string and resync `prev_c', `next_c'
785                  and pos. */
786               str_byte += bt_len - 1;
787               pos += bt_len - 1;
788               GET_NEXT_WCHAR();
789             }
790           else
791             {
792               goto backtrack;
793             }
794         }
795       else
796         {
797           /* Check for end of string. */
798           if (next_c == L'\0')
799                 goto backtrack;
800
801           /* Read the next character. */
802           GET_NEXT_WCHAR();
803         }
804
805       next_state = NULL;
806       for (trans_i = state; trans_i->state; trans_i++)
807         {
808           if (trans_i->code_min <= (tre_cint_t)prev_c
809               && trans_i->code_max >= (tre_cint_t)prev_c)
810             {
811               if (trans_i->assertions
812                   && (CHECK_ASSERTIONS(trans_i->assertions)
813                       || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
814                 {
815                   continue;
816                 }
817
818               if (next_state == NULL)
819                 {
820                   /* First matching transition. */
821                   next_state = trans_i->state;
822                   next_tags = trans_i->tags;
823                 }
824               else
825                 {
826                   /* Second matching transition.  We may need to backtrack here
827                      to take this transition instead of the first one, so we
828                      push this transition in the backtracking stack so we can
829                      jump back here if needed. */
830                   BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
831                                 trans_i->state_id, next_c, tags, mbstate);
832                   {
833                     int *tmp;
834                     for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
835                       stack->item.tags[*tmp] = pos;
836                   }
837 #if 0 /* XXX - it's important not to look at all transitions here to keep
838          the stack small! */
839                   break;
840 #endif
841                 }
842             }
843         }
844
845       if (next_state != NULL)
846         {
847           /* Matching transitions were found.  Take the first one. */
848           state = next_state;
849
850           /* Update the tag values. */
851           if (next_tags)
852             while (*next_tags >= 0)
853               tags[*next_tags++] = pos;
854         }
855       else
856         {
857         backtrack:
858           /* A matching transition was not found.  Try to backtrack. */
859           if (stack->prev)
860             {
861               if (stack->item.state->assertions & ASSERT_BACKREF)
862                 {
863                   states_seen[stack->item.state_id] = 0;
864                 }
865
866               BT_STACK_POP();
867             }
868           else if (match_eo < 0)
869             {
870               /* Try starting from a later position in the input string. */
871               /* Check for end of string. */
872               if (next_c == L'\0')
873                     {
874                       break;
875                     }
876               next_c = next_c_start;
877 #ifdef TRE_MBSTATE
878               mbstate = mbstate_start;
879 #endif /* TRE_MBSTATE */
880               str_byte = str_byte_start;
881               goto retry;
882             }
883           else
884             {
885               break;
886             }
887         }
888     }
889
890   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
891   *match_end_ofs = match_eo;
892
893  error_exit:
894   tre_bt_mem_destroy(mem);
895 #ifndef TRE_USE_ALLOCA
896   if (tags)
897     xfree(tags);
898   if (pmatch)
899     xfree(pmatch);
900   if (states_seen)
901     xfree(states_seen);
902 #endif /* !TRE_USE_ALLOCA */
903
904   return ret;
905 }
906
907 /***********************************************************************
908  from regexec.c
909 ***********************************************************************/
910
911 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
912    endpoint values. */
913 static void
914 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
915                 const tre_tnfa_t *tnfa, int *tags, int match_eo)
916 {
917   tre_submatch_data_t *submatch_data;
918   unsigned int i, j;
919   int *parents;
920
921   i = 0;
922   if (match_eo >= 0 && !(cflags & REG_NOSUB))
923     {
924       /* Construct submatch offsets from the tags. */
925       submatch_data = tnfa->submatch_data;
926       while (i < tnfa->num_submatches && i < nmatch)
927         {
928           if (submatch_data[i].so_tag == tnfa->end_tag)
929             pmatch[i].rm_so = match_eo;
930           else
931             pmatch[i].rm_so = tags[submatch_data[i].so_tag];
932
933           if (submatch_data[i].eo_tag == tnfa->end_tag)
934             pmatch[i].rm_eo = match_eo;
935           else
936             pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
937
938           /* If either of the endpoints were not used, this submatch
939              was not part of the match. */
940           if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
941             pmatch[i].rm_so = pmatch[i].rm_eo = -1;
942
943           i++;
944         }
945       /* Reset all submatches that are not within all of their parent
946          submatches. */
947       i = 0;
948       while (i < tnfa->num_submatches && i < nmatch)
949         {
950           if (pmatch[i].rm_eo == -1)
951             assert(pmatch[i].rm_so == -1);
952           assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
953
954           parents = submatch_data[i].parents;
955           if (parents != NULL)
956             for (j = 0; parents[j] >= 0; j++)
957               {
958                 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
959                     || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
960                   pmatch[i].rm_so = pmatch[i].rm_eo = -1;
961               }
962           i++;
963         }
964     }
965
966   while (i < nmatch)
967     {
968       pmatch[i].rm_so = -1;
969       pmatch[i].rm_eo = -1;
970       i++;
971     }
972 }
973
974
975 /*
976   Wrapper functions for POSIX compatible regexp matching.
977 */
978
979 int
980 regexec(const regex_t *restrict preg, const char *restrict string,
981           size_t nmatch, regmatch_t pmatch[restrict], int eflags)
982 {
983   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
984   reg_errcode_t status;
985   int *tags = NULL, eo;
986   if (tnfa->num_tags > 0 && nmatch > 0)
987     {
988       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
989       if (tags == NULL)
990         return REG_ESPACE;
991     }
992
993   /* Dispatch to the appropriate matcher. */
994   if (tnfa->have_backrefs)
995     {
996       /* The regex has back references, use the backtracking matcher. */
997       status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
998     }
999   else
1000     {
1001       /* Exact matching, no back references, use the parallel matcher. */
1002       status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
1003     }
1004
1005   if (status == REG_OK)
1006     /* A match was found, so fill the submatch registers. */
1007     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1008   if (tags)
1009     xfree(tags);
1010   return status;
1011 }