5c4cb9224e7bd7709152f1457723518962801b7b
[musl] / src / regex / regexec.c
1 /*
2   regexec.c - TRE POSIX compatible matching functions (and more).
3
4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5   All rights reserved.
6
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions
9   are met:
10
11     1. Redistributions of source code must retain the above copyright
12        notice, this list of conditions and the following disclaimer.
13
14     2. Redistributions in binary form must reproduce the above copyright
15        notice, this list of conditions and the following disclaimer in the
16        documentation and/or other materials provided with the distribution.
17
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 */
31
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include <wctype.h>
36 #include <limits.h>
37 #include <stdint.h>
38
39 #include <regex.h>
40
41 #include "tre.h"
42
43 #include <assert.h>
44
45 static void
46 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
47                 const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
48
49 /***********************************************************************
50  from tre-match-utils.h
51 ***********************************************************************/
52
53 #define GET_NEXT_WCHAR() do {                                                 \
54     prev_c = next_c; pos += pos_add_next;                                     \
55     if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
56         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
57         else pos_add_next++;                                                  \
58     }                                                                         \
59     str_byte += pos_add_next;                                                 \
60   } while (0)
61
62 #define IS_WORD_CHAR(c)  ((c) == L'_' || tre_isalnum(c))
63
64 #define CHECK_ASSERTIONS(assertions)                                          \
65   (((assertions & ASSERT_AT_BOL)                                              \
66     && (pos > 0 || reg_notbol)                                                \
67     && (prev_c != L'\n' || !reg_newline))                                     \
68    || ((assertions & ASSERT_AT_EOL)                                           \
69        && (next_c != L'\0' || reg_noteol)                                     \
70        && (next_c != L'\n' || !reg_newline))                                  \
71    || ((assertions & ASSERT_AT_BOW)                                           \
72        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))                    \
73    || ((assertions & ASSERT_AT_EOW)                                           \
74        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))                    \
75    || ((assertions & ASSERT_AT_WB)                                            \
76        && (pos != 0 && next_c != L'\0'                                        \
77            && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))                  \
78    || ((assertions & ASSERT_AT_WB_NEG)                                        \
79        && (pos == 0 || next_c == L'\0'                                        \
80            || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
81
82 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
83   (((trans_i->assertions & ASSERT_CHAR_CLASS)                                 \
84        && !(tnfa->cflags & REG_ICASE)                                         \
85        && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))                 \
86     || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
87         && (tnfa->cflags & REG_ICASE)                                         \
88         && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
89         && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
90     || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
91         && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
92                                       tnfa->cflags & REG_ICASE)))
93
94
95
96
97 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
98 static int
99 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
100               regoff_t *t1, regoff_t *t2)
101 {
102   int i;
103   for (i = 0; i < num_tags; i++)
104     {
105       if (tag_directions[i] == TRE_TAG_MINIMIZE)
106         {
107           if (t1[i] < t2[i])
108             return 1;
109           if (t1[i] > t2[i])
110             return 0;
111         }
112       else
113         {
114           if (t1[i] > t2[i])
115             return 1;
116           if (t1[i] < t2[i])
117             return 0;
118         }
119     }
120   /*  assert(0);*/
121   return 0;
122 }
123
124 static int
125 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
126 {
127   while (*classes != (tre_ctype_t)0)
128     if ((!icase && tre_isctype(wc, *classes))
129         || (icase && (tre_isctype(tre_toupper(wc), *classes)
130                       || tre_isctype(tre_tolower(wc), *classes))))
131       return 1; /* Match. */
132     else
133       classes++;
134   return 0; /* No match. */
135 }
136
137
138 /***********************************************************************
139  from tre-match-parallel.c
140 ***********************************************************************/
141
142 /*
143   This algorithm searches for matches basically by reading characters
144   in the searched string one by one, starting at the beginning.  All
145   matching paths in the TNFA are traversed in parallel.  When two or
146   more paths reach the same state, exactly one is chosen according to
147   tag ordering rules; if returning submatches is not required it does
148   not matter which path is chosen.
149
150   The worst case time required for finding the leftmost and longest
151   match, or determining that there is no match, is always linearly
152   dependent on the length of the text being searched.
153
154   This algorithm cannot handle TNFAs with back referencing nodes.
155   See `tre-match-backtrack.c'.
156 */
157
158 typedef struct {
159   tre_tnfa_transition_t *state;
160   regoff_t *tags;
161 } tre_tnfa_reach_t;
162
163 typedef struct {
164   regoff_t pos;
165   regoff_t **tags;
166 } tre_reach_pos_t;
167
168
169 static reg_errcode_t
170 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
171                       regoff_t *match_tags, int eflags,
172                       regoff_t *match_end_ofs)
173 {
174   /* State variables required by GET_NEXT_WCHAR. */
175   tre_char_t prev_c = 0, next_c = 0;
176   const char *str_byte = string;
177   regoff_t pos = -1;
178   regoff_t pos_add_next = 1;
179 #ifdef TRE_MBSTATE
180   mbstate_t mbstate;
181 #endif /* TRE_MBSTATE */
182   int reg_notbol = eflags & REG_NOTBOL;
183   int reg_noteol = eflags & REG_NOTEOL;
184   int reg_newline = tnfa->cflags & REG_NEWLINE;
185   reg_errcode_t ret;
186
187   char *buf;
188   tre_tnfa_transition_t *trans_i;
189   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
190   tre_reach_pos_t *reach_pos;
191   int *tag_i;
192   int num_tags, i;
193
194   regoff_t match_eo = -1;          /* end offset of match (-1 if no match found yet) */
195   int new_match = 0;
196   regoff_t *tmp_tags = NULL;
197   regoff_t *tmp_iptr;
198
199 #ifdef TRE_MBSTATE
200   memset(&mbstate, '\0', sizeof(mbstate));
201 #endif /* TRE_MBSTATE */
202
203   if (!match_tags)
204     num_tags = 0;
205   else
206     num_tags = tnfa->num_tags;
207
208   /* Allocate memory for temporary data required for matching.  This needs to
209      be done for every matching operation to be thread safe.  This allocates
210      everything in a single large block with calloc(). */
211   {
212     size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
213     char *tmp_buf;
214
215     /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
216      * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
217     if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
218       goto error_exit;
219
220     /* Likewise check rbytes. */
221     if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
222       goto error_exit;
223
224     /* Likewise check pbytes. */
225     if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
226       goto error_exit;
227
228     /* Compute the length of the block we need. */
229     tbytes = sizeof(*tmp_tags) * num_tags;
230     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
231     pbytes = sizeof(*reach_pos) * tnfa->num_states;
232     xbytes = sizeof(regoff_t) * num_tags;
233     total_bytes =
234       (sizeof(long) - 1) * 4 /* for alignment paddings */
235       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
236
237     /* Allocate the memory. */
238     buf = calloc(total_bytes, 1);
239     if (buf == NULL)
240       return REG_ESPACE;
241
242     /* Get the various pointers within tmp_buf (properly aligned). */
243     tmp_tags = (void *)buf;
244     tmp_buf = buf + tbytes;
245     tmp_buf += ALIGN(tmp_buf, long);
246     reach_next = (void *)tmp_buf;
247     tmp_buf += rbytes;
248     tmp_buf += ALIGN(tmp_buf, long);
249     reach = (void *)tmp_buf;
250     tmp_buf += rbytes;
251     tmp_buf += ALIGN(tmp_buf, long);
252     reach_pos = (void *)tmp_buf;
253     tmp_buf += pbytes;
254     tmp_buf += ALIGN(tmp_buf, long);
255     for (i = 0; i < tnfa->num_states; i++)
256       {
257         reach[i].tags = (void *)tmp_buf;
258         tmp_buf += xbytes;
259         reach_next[i].tags = (void *)tmp_buf;
260         tmp_buf += xbytes;
261       }
262   }
263
264   for (i = 0; i < tnfa->num_states; i++)
265     reach_pos[i].pos = -1;
266
267   GET_NEXT_WCHAR();
268   pos = 0;
269
270   reach_next_i = reach_next;
271   while (1)
272     {
273       /* If no match found yet, add the initial states to `reach_next'. */
274       if (match_eo < 0)
275         {
276           trans_i = tnfa->initial;
277           while (trans_i->state != NULL)
278             {
279               if (reach_pos[trans_i->state_id].pos < pos)
280                 {
281                   if (trans_i->assertions
282                       && CHECK_ASSERTIONS(trans_i->assertions))
283                     {
284                       trans_i++;
285                       continue;
286                     }
287
288                   reach_next_i->state = trans_i->state;
289                   for (i = 0; i < num_tags; i++)
290                     reach_next_i->tags[i] = -1;
291                   tag_i = trans_i->tags;
292                   if (tag_i)
293                     while (*tag_i >= 0)
294                       {
295                         if (*tag_i < num_tags)
296                           reach_next_i->tags[*tag_i] = pos;
297                         tag_i++;
298                       }
299                   if (reach_next_i->state == tnfa->final)
300                     {
301                       match_eo = pos;
302                       new_match = 1;
303                       for (i = 0; i < num_tags; i++)
304                         match_tags[i] = reach_next_i->tags[i];
305                     }
306                   reach_pos[trans_i->state_id].pos = pos;
307                   reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
308                   reach_next_i++;
309                 }
310               trans_i++;
311             }
312           reach_next_i->state = NULL;
313         }
314       else
315         {
316           if (num_tags == 0 || reach_next_i == reach_next)
317             /* We have found a match. */
318             break;
319         }
320
321       /* Check for end of string. */
322       if (!next_c) break;
323
324       GET_NEXT_WCHAR();
325
326       /* Swap `reach' and `reach_next'. */
327       reach_i = reach;
328       reach = reach_next;
329       reach_next = reach_i;
330
331       /* For each state in `reach', weed out states that don't fulfill the
332          minimal matching conditions. */
333       if (tnfa->num_minimals && new_match)
334         {
335           new_match = 0;
336           reach_next_i = reach_next;
337           for (reach_i = reach; reach_i->state; reach_i++)
338             {
339               int skip = 0;
340               for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
341                 {
342                   int end = tnfa->minimal_tags[i];
343                   int start = tnfa->minimal_tags[i + 1];
344                   if (end >= num_tags)
345                     {
346                       skip = 1;
347                       break;
348                     }
349                   else if (reach_i->tags[start] == match_tags[start]
350                            && reach_i->tags[end] < match_tags[end])
351                     {
352                       skip = 1;
353                       break;
354                     }
355                 }
356               if (!skip)
357                 {
358                   reach_next_i->state = reach_i->state;
359                   tmp_iptr = reach_next_i->tags;
360                   reach_next_i->tags = reach_i->tags;
361                   reach_i->tags = tmp_iptr;
362                   reach_next_i++;
363                 }
364             }
365           reach_next_i->state = NULL;
366
367           /* Swap `reach' and `reach_next'. */
368           reach_i = reach;
369           reach = reach_next;
370           reach_next = reach_i;
371         }
372
373       /* For each state in `reach' see if there is a transition leaving with
374          the current input symbol to a state not yet in `reach_next', and
375          add the destination states to `reach_next'. */
376       reach_next_i = reach_next;
377       for (reach_i = reach; reach_i->state; reach_i++)
378         {
379           for (trans_i = reach_i->state; trans_i->state; trans_i++)
380             {
381               /* Does this transition match the input symbol? */
382               if (trans_i->code_min <= (tre_cint_t)prev_c &&
383                   trans_i->code_max >= (tre_cint_t)prev_c)
384                 {
385                   if (trans_i->assertions
386                       && (CHECK_ASSERTIONS(trans_i->assertions)
387                           || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
388                     {
389                       continue;
390                     }
391
392                   /* Compute the tags after this transition. */
393                   for (i = 0; i < num_tags; i++)
394                     tmp_tags[i] = reach_i->tags[i];
395                   tag_i = trans_i->tags;
396                   if (tag_i != NULL)
397                     while (*tag_i >= 0)
398                       {
399                         if (*tag_i < num_tags)
400                           tmp_tags[*tag_i] = pos;
401                         tag_i++;
402                       }
403
404                   if (reach_pos[trans_i->state_id].pos < pos)
405                     {
406                       /* Found an unvisited node. */
407                       reach_next_i->state = trans_i->state;
408                       tmp_iptr = reach_next_i->tags;
409                       reach_next_i->tags = tmp_tags;
410                       tmp_tags = tmp_iptr;
411                       reach_pos[trans_i->state_id].pos = pos;
412                       reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
413
414                       if (reach_next_i->state == tnfa->final
415                           && (match_eo == -1
416                               || (num_tags > 0
417                                   && reach_next_i->tags[0] <= match_tags[0])))
418                         {
419                           match_eo = pos;
420                           new_match = 1;
421                           for (i = 0; i < num_tags; i++)
422                             match_tags[i] = reach_next_i->tags[i];
423                         }
424                       reach_next_i++;
425
426                     }
427                   else
428                     {
429                       assert(reach_pos[trans_i->state_id].pos == pos);
430                       /* Another path has also reached this state.  We choose
431                          the winner by examining the tag values for both
432                          paths. */
433                       if (tre_tag_order(num_tags, tnfa->tag_directions,
434                                         tmp_tags,
435                                         *reach_pos[trans_i->state_id].tags))
436                         {
437                           /* The new path wins. */
438                           tmp_iptr = *reach_pos[trans_i->state_id].tags;
439                           *reach_pos[trans_i->state_id].tags = tmp_tags;
440                           if (trans_i->state == tnfa->final)
441                             {
442                               match_eo = pos;
443                               new_match = 1;
444                               for (i = 0; i < num_tags; i++)
445                                 match_tags[i] = tmp_tags[i];
446                             }
447                           tmp_tags = tmp_iptr;
448                         }
449                     }
450                 }
451             }
452         }
453       reach_next_i->state = NULL;
454     }
455
456   *match_end_ofs = match_eo;
457   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
458 error_exit:
459   xfree(buf);
460   return ret;
461 }
462
463
464
465 /***********************************************************************
466  from tre-match-backtrack.c
467 ***********************************************************************/
468
469 /*
470   This matcher is for regexps that use back referencing.  Regexp matching
471   with back referencing is an NP-complete problem on the number of back
472   references.  The easiest way to match them is to use a backtracking
473   routine which basically goes through all possible paths in the TNFA
474   and chooses the one which results in the best (leftmost and longest)
475   match.  This can be spectacularly expensive and may run out of stack
476   space, but there really is no better known generic algorithm.  Quoting
477   Henry Spencer from comp.compilers:
478   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
479
480     POSIX.2 REs require longest match, which is really exciting to
481     implement since the obsolete ("basic") variant also includes
482     \<digit>.  I haven't found a better way of tackling this than doing
483     a preliminary match using a DFA (or simulation) on a modified RE
484     that just replicates subREs for \<digit>, and then doing a
485     backtracking match to determine whether the subRE matches were
486     right.  This can be rather slow, but I console myself with the
487     thought that people who use \<digit> deserve very slow execution.
488     (Pun unintentional but very appropriate.)
489
490 */
491
492 typedef struct {
493   regoff_t pos;
494   const char *str_byte;
495   tre_tnfa_transition_t *state;
496   int state_id;
497   int next_c;
498   regoff_t *tags;
499 #ifdef TRE_MBSTATE
500   mbstate_t mbstate;
501 #endif /* TRE_MBSTATE */
502 } tre_backtrack_item_t;
503
504 typedef struct tre_backtrack_struct {
505   tre_backtrack_item_t item;
506   struct tre_backtrack_struct *prev;
507   struct tre_backtrack_struct *next;
508 } *tre_backtrack_t;
509
510 #ifdef TRE_MBSTATE
511 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
512 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
513 #else /* !TRE_MBSTATE */
514 #define BT_STACK_MBSTATE_IN
515 #define BT_STACK_MBSTATE_OUT
516 #endif /* !TRE_MBSTATE */
517
518 #define tre_bt_mem_new            tre_mem_new
519 #define tre_bt_mem_alloc          tre_mem_alloc
520 #define tre_bt_mem_destroy        tre_mem_destroy
521
522
523 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
524   do                                                                          \
525     {                                                                         \
526       int i;                                                                  \
527       if (!stack->next)                                                       \
528         {                                                                     \
529           tre_backtrack_t s;                                                  \
530           s = tre_bt_mem_alloc(mem, sizeof(*s));                              \
531           if (!s)                                                             \
532             {                                                                 \
533               tre_bt_mem_destroy(mem);                                        \
534               if (tags)                                                       \
535                 xfree(tags);                                                  \
536               if (pmatch)                                                     \
537                 xfree(pmatch);                                                \
538               if (states_seen)                                                \
539                 xfree(states_seen);                                           \
540               return REG_ESPACE;                                              \
541             }                                                                 \
542           s->prev = stack;                                                    \
543           s->next = NULL;                                                     \
544           s->item.tags = tre_bt_mem_alloc(mem,                                \
545                                           sizeof(*tags) * tnfa->num_tags);    \
546           if (!s->item.tags)                                                  \
547             {                                                                 \
548               tre_bt_mem_destroy(mem);                                        \
549               if (tags)                                                       \
550                 xfree(tags);                                                  \
551               if (pmatch)                                                     \
552                 xfree(pmatch);                                                \
553               if (states_seen)                                                \
554                 xfree(states_seen);                                           \
555               return REG_ESPACE;                                              \
556             }                                                                 \
557           stack->next = s;                                                    \
558           stack = s;                                                          \
559         }                                                                     \
560       else                                                                    \
561         stack = stack->next;                                                  \
562       stack->item.pos = (_pos);                                               \
563       stack->item.str_byte = (_str_byte);                                     \
564       stack->item.state = (_state);                                           \
565       stack->item.state_id = (_state_id);                                     \
566       stack->item.next_c = (_next_c);                                         \
567       for (i = 0; i < tnfa->num_tags; i++)                                    \
568         stack->item.tags[i] = (_tags)[i];                                     \
569       BT_STACK_MBSTATE_IN;                                                    \
570     }                                                                         \
571   while (0)
572
573 #define BT_STACK_POP()                                                        \
574   do                                                                          \
575     {                                                                         \
576       int i;                                                                  \
577       assert(stack->prev);                                                    \
578       pos = stack->item.pos;                                                  \
579       str_byte = stack->item.str_byte;                                        \
580       state = stack->item.state;                                              \
581       next_c = stack->item.next_c;                                            \
582       for (i = 0; i < tnfa->num_tags; i++)                                    \
583         tags[i] = stack->item.tags[i];                                        \
584       BT_STACK_MBSTATE_OUT;                                                   \
585       stack = stack->prev;                                                    \
586     }                                                                         \
587   while (0)
588
589 #undef MIN
590 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
591
592 static reg_errcode_t
593 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
594                        regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
595 {
596   /* State variables required by GET_NEXT_WCHAR. */
597   tre_char_t prev_c = 0, next_c = 0;
598   const char *str_byte = string;
599   regoff_t pos = 0;
600   regoff_t pos_add_next = 1;
601 #ifdef TRE_MBSTATE
602   mbstate_t mbstate;
603 #endif /* TRE_MBSTATE */
604   int reg_notbol = eflags & REG_NOTBOL;
605   int reg_noteol = eflags & REG_NOTEOL;
606   int reg_newline = tnfa->cflags & REG_NEWLINE;
607
608   /* These are used to remember the necessary values of the above
609      variables to return to the position where the current search
610      started from. */
611   int next_c_start;
612   const char *str_byte_start;
613   regoff_t pos_start = -1;
614 #ifdef TRE_MBSTATE
615   mbstate_t mbstate_start;
616 #endif /* TRE_MBSTATE */
617
618   /* End offset of best match so far, or -1 if no match found yet. */
619   regoff_t match_eo = -1;
620   /* Tag arrays. */
621   int *next_tags;
622   regoff_t *tags = NULL;
623   /* Current TNFA state. */
624   tre_tnfa_transition_t *state;
625   int *states_seen = NULL;
626
627   /* Memory allocator to for allocating the backtracking stack. */
628   tre_mem_t mem = tre_bt_mem_new();
629
630   /* The backtracking stack. */
631   tre_backtrack_t stack;
632
633   tre_tnfa_transition_t *trans_i;
634   regmatch_t *pmatch = NULL;
635   int ret;
636
637 #ifdef TRE_MBSTATE
638   memset(&mbstate, '\0', sizeof(mbstate));
639 #endif /* TRE_MBSTATE */
640
641   if (!mem)
642     return REG_ESPACE;
643   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
644   if (!stack)
645     {
646       ret = REG_ESPACE;
647       goto error_exit;
648     }
649   stack->prev = NULL;
650   stack->next = NULL;
651
652   if (tnfa->num_tags)
653     {
654       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
655       if (!tags)
656         {
657           ret = REG_ESPACE;
658           goto error_exit;
659         }
660     }
661   if (tnfa->num_submatches)
662     {
663       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
664       if (!pmatch)
665         {
666           ret = REG_ESPACE;
667           goto error_exit;
668         }
669     }
670   if (tnfa->num_states)
671     {
672       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
673       if (!states_seen)
674         {
675           ret = REG_ESPACE;
676           goto error_exit;
677         }
678     }
679
680  retry:
681   {
682     int i;
683     for (i = 0; i < tnfa->num_tags; i++)
684       {
685         tags[i] = -1;
686         if (match_tags)
687           match_tags[i] = -1;
688       }
689     for (i = 0; i < tnfa->num_states; i++)
690       states_seen[i] = 0;
691   }
692
693   state = NULL;
694   pos = pos_start;
695   GET_NEXT_WCHAR();
696   pos_start = pos;
697   next_c_start = next_c;
698   str_byte_start = str_byte;
699 #ifdef TRE_MBSTATE
700   mbstate_start = mbstate;
701 #endif /* TRE_MBSTATE */
702
703   /* Handle initial states. */
704   next_tags = NULL;
705   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
706     {
707       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
708         {
709           continue;
710         }
711       if (state == NULL)
712         {
713           /* Start from this state. */
714           state = trans_i->state;
715           next_tags = trans_i->tags;
716         }
717       else
718         {
719           /* Backtrack to this state. */
720           BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
721                         trans_i->state_id, next_c, tags, mbstate);
722           {
723             int *tmp = trans_i->tags;
724             if (tmp)
725               while (*tmp >= 0)
726                 stack->item.tags[*tmp++] = pos;
727           }
728         }
729     }
730
731   if (next_tags)
732     for (; *next_tags >= 0; next_tags++)
733       tags[*next_tags] = pos;
734
735
736   if (state == NULL)
737     goto backtrack;
738
739   while (1)
740     {
741       tre_tnfa_transition_t *next_state;
742       int empty_br_match;
743
744       if (state == tnfa->final)
745         {
746           if (match_eo < pos
747               || (match_eo == pos
748                   && match_tags
749                   && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
750                                    tags, match_tags)))
751             {
752               int i;
753               /* This match wins the previous match. */
754               match_eo = pos;
755               if (match_tags)
756                 for (i = 0; i < tnfa->num_tags; i++)
757                   match_tags[i] = tags[i];
758             }
759           /* Our TNFAs never have transitions leaving from the final state,
760              so we jump right to backtracking. */
761           goto backtrack;
762         }
763
764       /* Go to the next character in the input string. */
765       empty_br_match = 0;
766       trans_i = state;
767       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
768         {
769           /* This is a back reference state.  All transitions leaving from
770              this state have the same back reference "assertion".  Instead
771              of reading the next character, we match the back reference. */
772           regoff_t so, eo;
773           int bt = trans_i->u.backref;
774           regoff_t bt_len;
775           int result;
776
777           /* Get the substring we need to match against.  Remember to
778              turn off REG_NOSUB temporarily. */
779           tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
780                           tnfa, tags, pos);
781           so = pmatch[bt].rm_so;
782           eo = pmatch[bt].rm_eo;
783           bt_len = eo - so;
784
785           result = strncmp((const char*)string + so, str_byte - 1,
786                                  (size_t)bt_len);
787
788           if (result == 0)
789             {
790               /* Back reference matched.  Check for infinite loop. */
791               if (bt_len == 0)
792                 empty_br_match = 1;
793               if (empty_br_match && states_seen[trans_i->state_id])
794                 {
795                   goto backtrack;
796                 }
797
798               states_seen[trans_i->state_id] = empty_br_match;
799
800               /* Advance in input string and resync `prev_c', `next_c'
801                  and pos. */
802               str_byte += bt_len - 1;
803               pos += bt_len - 1;
804               GET_NEXT_WCHAR();
805             }
806           else
807             {
808               goto backtrack;
809             }
810         }
811       else
812         {
813           /* Check for end of string. */
814           if (next_c == L'\0')
815                 goto backtrack;
816
817           /* Read the next character. */
818           GET_NEXT_WCHAR();
819         }
820
821       next_state = NULL;
822       for (trans_i = state; trans_i->state; trans_i++)
823         {
824           if (trans_i->code_min <= (tre_cint_t)prev_c
825               && trans_i->code_max >= (tre_cint_t)prev_c)
826             {
827               if (trans_i->assertions
828                   && (CHECK_ASSERTIONS(trans_i->assertions)
829                       || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
830                 {
831                   continue;
832                 }
833
834               if (next_state == NULL)
835                 {
836                   /* First matching transition. */
837                   next_state = trans_i->state;
838                   next_tags = trans_i->tags;
839                 }
840               else
841                 {
842                   /* Second matching transition.  We may need to backtrack here
843                      to take this transition instead of the first one, so we
844                      push this transition in the backtracking stack so we can
845                      jump back here if needed. */
846                   BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
847                                 trans_i->state_id, next_c, tags, mbstate);
848                   {
849                     int *tmp;
850                     for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
851                       stack->item.tags[*tmp] = pos;
852                   }
853 #if 0 /* XXX - it's important not to look at all transitions here to keep
854          the stack small! */
855                   break;
856 #endif
857                 }
858             }
859         }
860
861       if (next_state != NULL)
862         {
863           /* Matching transitions were found.  Take the first one. */
864           state = next_state;
865
866           /* Update the tag values. */
867           if (next_tags)
868             while (*next_tags >= 0)
869               tags[*next_tags++] = pos;
870         }
871       else
872         {
873         backtrack:
874           /* A matching transition was not found.  Try to backtrack. */
875           if (stack->prev)
876             {
877               if (stack->item.state->assertions & ASSERT_BACKREF)
878                 {
879                   states_seen[stack->item.state_id] = 0;
880                 }
881
882               BT_STACK_POP();
883             }
884           else if (match_eo < 0)
885             {
886               /* Try starting from a later position in the input string. */
887               /* Check for end of string. */
888               if (next_c == L'\0')
889                     {
890                       break;
891                     }
892               next_c = next_c_start;
893 #ifdef TRE_MBSTATE
894               mbstate = mbstate_start;
895 #endif /* TRE_MBSTATE */
896               str_byte = str_byte_start;
897               goto retry;
898             }
899           else
900             {
901               break;
902             }
903         }
904     }
905
906   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
907   *match_end_ofs = match_eo;
908
909  error_exit:
910   tre_bt_mem_destroy(mem);
911 #ifndef TRE_USE_ALLOCA
912   if (tags)
913     xfree(tags);
914   if (pmatch)
915     xfree(pmatch);
916   if (states_seen)
917     xfree(states_seen);
918 #endif /* !TRE_USE_ALLOCA */
919
920   return ret;
921 }
922
923 /***********************************************************************
924  from regexec.c
925 ***********************************************************************/
926
927 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
928    endpoint values. */
929 static void
930 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
931                 const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
932 {
933   tre_submatch_data_t *submatch_data;
934   unsigned int i, j;
935   int *parents;
936
937   i = 0;
938   if (match_eo >= 0 && !(cflags & REG_NOSUB))
939     {
940       /* Construct submatch offsets from the tags. */
941       submatch_data = tnfa->submatch_data;
942       while (i < tnfa->num_submatches && i < nmatch)
943         {
944           if (submatch_data[i].so_tag == tnfa->end_tag)
945             pmatch[i].rm_so = match_eo;
946           else
947             pmatch[i].rm_so = tags[submatch_data[i].so_tag];
948
949           if (submatch_data[i].eo_tag == tnfa->end_tag)
950             pmatch[i].rm_eo = match_eo;
951           else
952             pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
953
954           /* If either of the endpoints were not used, this submatch
955              was not part of the match. */
956           if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
957             pmatch[i].rm_so = pmatch[i].rm_eo = -1;
958
959           i++;
960         }
961       /* Reset all submatches that are not within all of their parent
962          submatches. */
963       i = 0;
964       while (i < tnfa->num_submatches && i < nmatch)
965         {
966           if (pmatch[i].rm_eo == -1)
967             assert(pmatch[i].rm_so == -1);
968           assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
969
970           parents = submatch_data[i].parents;
971           if (parents != NULL)
972             for (j = 0; parents[j] >= 0; j++)
973               {
974                 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
975                     || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
976                   pmatch[i].rm_so = pmatch[i].rm_eo = -1;
977               }
978           i++;
979         }
980     }
981
982   while (i < nmatch)
983     {
984       pmatch[i].rm_so = -1;
985       pmatch[i].rm_eo = -1;
986       i++;
987     }
988 }
989
990
991 /*
992   Wrapper functions for POSIX compatible regexp matching.
993 */
994
995 int
996 regexec(const regex_t *restrict preg, const char *restrict string,
997           size_t nmatch, regmatch_t pmatch[restrict], int eflags)
998 {
999   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
1000   reg_errcode_t status;
1001   regoff_t *tags = NULL, eo;
1002   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
1003   if (tnfa->num_tags > 0 && nmatch > 0)
1004     {
1005       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
1006       if (tags == NULL)
1007         return REG_ESPACE;
1008     }
1009
1010   /* Dispatch to the appropriate matcher. */
1011   if (tnfa->have_backrefs)
1012     {
1013       /* The regex has back references, use the backtracking matcher. */
1014       status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
1015     }
1016   else
1017     {
1018       /* Exact matching, no back references, use the parallel matcher. */
1019       status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
1020     }
1021
1022   if (status == REG_OK)
1023     /* A match was found, so fill the submatch registers. */
1024     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1025   if (tags)
1026     xfree(tags);
1027   return status;
1028 }