initial check-in, version 0.5.0
[musl] / src / regex / regexec.c
1 /*
2   regexec.c - TRE POSIX compatible matching functions (and more).
3
4   Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>.
5
6   This library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   This library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with this library; if not, write to the Free Software
18   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19
20 */
21
22 #include <stdlib.h>
23 #include <string.h>
24 #include <wchar.h>
25 #include <wctype.h>
26 #include <limits.h>
27
28 #include <regex.h>
29
30 #include "tre.h"
31
32 #include <assert.h>
33
34 static void
35 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
36                 const tre_tnfa_t *tnfa, int *tags, int match_eo);
37
38 /***********************************************************************
39  from tre-match-utils.h
40 ***********************************************************************/
41
42 #define GET_NEXT_WCHAR() do {                                                 \
43     prev_c = next_c; pos += pos_add_next;                                     \
44     if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
45         if (pos_add_next < 0) return REG_NOMATCH;                             \
46         else pos_add_next++;                                                  \
47     }                                                                         \
48     str_byte += pos_add_next;                                                 \
49   } while (0)
50
51 #define CHECK_ASSERTIONS(assertions)                                          \
52   (((assertions & ASSERT_AT_BOL)                                              \
53     && (pos > 0 || reg_notbol)                                                \
54     && (prev_c != L'\n' || !reg_newline))                                     \
55    || ((assertions & ASSERT_AT_EOL)                                           \
56        && (next_c != L'\0' || reg_noteol)                                     \
57        && (next_c != L'\n' || !reg_newline)))
58
59 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
60 static int
61 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
62               int *t1, int *t2)
63 {
64   int i;
65   for (i = 0; i < num_tags; i++)
66     {
67       if (tag_directions[i] == TRE_TAG_MINIMIZE)
68         {
69           if (t1[i] < t2[i])
70             return 1;
71           if (t1[i] > t2[i])
72             return 0;
73         }
74       else
75         {
76           if (t1[i] > t2[i])
77             return 1;
78           if (t1[i] < t2[i])
79             return 0;
80         }
81     }
82   /*  assert(0);*/
83   return 0;
84 }
85
86 static int
87 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
88 {
89   DPRINT(("neg_char_classes_test: %p, %d, %d\n", classes, wc, icase));
90   while (*classes != (tre_ctype_t)0)
91     if ((!icase && tre_isctype(wc, *classes))
92         || (icase && (tre_isctype(tre_toupper(wc), *classes)
93                       || tre_isctype(tre_tolower(wc), *classes))))
94       return 1; /* Match. */
95     else
96       classes++;
97   return 0; /* No match. */
98 }
99
100
101 /***********************************************************************
102  from tre-match-parallel.c
103 ***********************************************************************/
104
105 /*
106   This algorithm searches for matches basically by reading characters
107   in the searched string one by one, starting at the beginning.  All
108   matching paths in the TNFA are traversed in parallel.  When two or
109   more paths reach the same state, exactly one is chosen according to
110   tag ordering rules; if returning submatches is not required it does
111   not matter which path is chosen.
112
113   The worst case time required for finding the leftmost and longest
114   match, or determining that there is no match, is always linearly
115   dependent on the length of the text being searched.
116
117   This algorithm cannot handle TNFAs with back referencing nodes.
118   See `tre-match-backtrack.c'.
119 */
120
121
122 typedef struct {
123   tre_tnfa_transition_t *state;
124   int *tags;
125 } tre_tnfa_reach_t;
126
127 typedef struct {
128   int pos;
129   int **tags;
130 } tre_reach_pos_t;
131
132
133 #ifdef TRE_DEBUG
134 static void
135 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
136 {
137   int i;
138
139   while (reach->state != NULL)
140     {
141       DPRINT((" %p", (void *)reach->state));
142       if (num_tags > 0)
143         {
144           DPRINT(("/"));
145           for (i = 0; i < num_tags; i++)
146             {
147               DPRINT(("%d:%d", i, reach->tags[i]));
148               if (i < (num_tags-1))
149                 DPRINT((","));
150             }
151         }
152       reach++;
153     }
154   DPRINT(("\n"));
155
156 }
157 #endif /* TRE_DEBUG */
158
159 static reg_errcode_t
160 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
161                       int *match_tags, int eflags, int *match_end_ofs)
162 {
163   /* State variables required by GET_NEXT_WCHAR. */
164   tre_char_t prev_c = 0, next_c = 0;
165   const char *str_byte = string;
166   int pos = -1;
167   int pos_add_next = 1;
168 #ifdef TRE_MBSTATE
169   mbstate_t mbstate;
170 #endif /* TRE_MBSTATE */
171   int reg_notbol = eflags & REG_NOTBOL;
172   int reg_noteol = eflags & REG_NOTEOL;
173   int reg_newline = tnfa->cflags & REG_NEWLINE;
174
175   char *buf;
176   tre_tnfa_transition_t *trans_i;
177   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
178   tre_reach_pos_t *reach_pos;
179   int *tag_i;
180   int num_tags, i;
181
182   int match_eo = -1;       /* end offset of match (-1 if no match found yet) */
183   int new_match = 0;
184   int *tmp_tags = NULL;
185   int *tmp_iptr;
186
187 #ifdef TRE_MBSTATE
188   memset(&mbstate, '\0', sizeof(mbstate));
189 #endif /* TRE_MBSTATE */
190
191   if (!match_tags)
192     num_tags = 0;
193   else
194     num_tags = tnfa->num_tags;
195
196   /* Allocate memory for temporary data required for matching.  This needs to
197      be done for every matching operation to be thread safe.  This allocates
198      everything in a single large block from the stack frame using alloca()
199      or with malloc() if alloca is unavailable. */
200   {
201     int tbytes, rbytes, pbytes, xbytes, total_bytes;
202     char *tmp_buf;
203     /* Compute the length of the block we need. */
204     tbytes = sizeof(*tmp_tags) * num_tags;
205     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
206     pbytes = sizeof(*reach_pos) * tnfa->num_states;
207     xbytes = sizeof(int) * num_tags;
208     total_bytes =
209       (sizeof(long) - 1) * 4 /* for alignment paddings */
210       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
211
212     /* Allocate the memory. */
213 #ifdef TRE_USE_ALLOCA
214     buf = alloca(total_bytes);
215 #else /* !TRE_USE_ALLOCA */
216     buf = xmalloc(total_bytes);
217 #endif /* !TRE_USE_ALLOCA */
218     if (buf == NULL)
219       return REG_ESPACE;
220     memset(buf, 0, total_bytes);
221
222     /* Get the various pointers within tmp_buf (properly aligned). */
223     tmp_tags = (void *)buf;
224     tmp_buf = buf + tbytes;
225     tmp_buf += ALIGN(tmp_buf, long);
226     reach_next = (void *)tmp_buf;
227     tmp_buf += rbytes;
228     tmp_buf += ALIGN(tmp_buf, long);
229     reach = (void *)tmp_buf;
230     tmp_buf += rbytes;
231     tmp_buf += ALIGN(tmp_buf, long);
232     reach_pos = (void *)tmp_buf;
233     tmp_buf += pbytes;
234     tmp_buf += ALIGN(tmp_buf, long);
235     for (i = 0; i < tnfa->num_states; i++)
236       {
237         reach[i].tags = (void *)tmp_buf;
238         tmp_buf += xbytes;
239         reach_next[i].tags = (void *)tmp_buf;
240         tmp_buf += xbytes;
241       }
242   }
243
244   for (i = 0; i < tnfa->num_states; i++)
245     reach_pos[i].pos = -1;
246
247   GET_NEXT_WCHAR();
248   pos = 0;
249
250   DPRINT(("length: %d\n", len));
251   DPRINT(("pos:chr/code | states and tags\n"));
252   DPRINT(("-------------+------------------------------------------------\n"));
253
254   reach_next_i = reach_next;
255   while (1)
256     {
257       /* If no match found yet, add the initial states to `reach_next'. */
258       if (match_eo < 0)
259         {
260           DPRINT((" init >"));
261           trans_i = tnfa->initial;
262           while (trans_i->state != NULL)
263             {
264               if (reach_pos[trans_i->state_id].pos < pos)
265                 {
266                   if (trans_i->assertions
267                       && CHECK_ASSERTIONS(trans_i->assertions))
268                     {
269                       DPRINT(("assertion failed\n"));
270                       trans_i++;
271                       continue;
272                     }
273
274                   DPRINT((" %p", (void *)trans_i->state));
275                   reach_next_i->state = trans_i->state;
276                   for (i = 0; i < num_tags; i++)
277                     reach_next_i->tags[i] = -1;
278                   tag_i = trans_i->tags;
279                   if (tag_i)
280                     while (*tag_i >= 0)
281                       {
282                         if (*tag_i < num_tags)
283                           reach_next_i->tags[*tag_i] = pos;
284                         tag_i++;
285                       }
286                   if (reach_next_i->state == tnfa->final)
287                     {
288                       DPRINT(("  found empty match\n"));
289                       match_eo = pos;
290                       new_match = 1;
291                       for (i = 0; i < num_tags; i++)
292                         match_tags[i] = reach_next_i->tags[i];
293                     }
294                   reach_pos[trans_i->state_id].pos = pos;
295                   reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
296                   reach_next_i++;
297                 }
298               trans_i++;
299             }
300           DPRINT(("\n"));
301           reach_next_i->state = NULL;
302         }
303       else
304         {
305           if (num_tags == 0 || reach_next_i == reach_next)
306             /* We have found a match. */
307             break;
308         }
309
310       /* Check for end of string. */
311       if (!next_c) break;
312
313       GET_NEXT_WCHAR();
314
315 #ifdef TRE_DEBUG
316       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
317       tre_print_reach(tnfa, reach_next, num_tags);
318       DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
319       tre_print_reach(tnfa, reach_next, num_tags);
320 #endif /* TRE_DEBUG */
321
322       /* Swap `reach' and `reach_next'. */
323       reach_i = reach;
324       reach = reach_next;
325       reach_next = reach_i;
326
327       /* For each state in `reach' see if there is a transition leaving with
328          the current input symbol to a state not yet in `reach_next', and
329          add the destination states to `reach_next'. */
330       reach_next_i = reach_next;
331       for (reach_i = reach; reach_i->state; reach_i++)
332         {
333           for (trans_i = reach_i->state; trans_i->state; trans_i++)
334             {
335               /* Does this transition match the input symbol? */
336               if (trans_i->code_min <= prev_c &&
337                   trans_i->code_max >= prev_c)
338                 {
339                   if (trans_i->assertions
340                       && (CHECK_ASSERTIONS(trans_i->assertions)
341                           /* Handle character class transitions. */
342                           || ((trans_i->assertions & ASSERT_CHAR_CLASS)
343                               && !(tnfa->cflags & REG_ICASE)
344                               && !tre_isctype((tre_cint_t)prev_c,
345                                               trans_i->u.class))
346                           || ((trans_i->assertions & ASSERT_CHAR_CLASS)
347                               && (tnfa->cflags & REG_ICASE)
348                               && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
349                                                trans_i->u.class)
350                                   && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
351                                                   trans_i->u.class)))
352                           || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
353                               && tre_neg_char_classes_match(trans_i->neg_classes,
354                                                             (tre_cint_t)prev_c,
355                                                             tnfa->cflags & REG_ICASE))))
356                     {
357                       DPRINT(("assertion failed\n"));
358                       continue;
359                     }
360
361                   /* Compute the tags after this transition. */
362                   for (i = 0; i < num_tags; i++)
363                     tmp_tags[i] = reach_i->tags[i];
364                   tag_i = trans_i->tags;
365                   if (tag_i != NULL)
366                     while (*tag_i >= 0)
367                       {
368                         if (*tag_i < num_tags)
369                           tmp_tags[*tag_i] = pos;
370                         tag_i++;
371                       }
372
373                   if (reach_pos[trans_i->state_id].pos < pos)
374                     {
375                       /* Found an unvisited node. */
376                       reach_next_i->state = trans_i->state;
377                       tmp_iptr = reach_next_i->tags;
378                       reach_next_i->tags = tmp_tags;
379                       tmp_tags = tmp_iptr;
380                       reach_pos[trans_i->state_id].pos = pos;
381                       reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
382
383                       if (reach_next_i->state == tnfa->final
384                           && (match_eo == -1
385                               || (num_tags > 0
386                                   && reach_next_i->tags[0] <= match_tags[0])))
387                         {
388                           DPRINT(("  found match %p\n", trans_i->state));
389                           match_eo = pos;
390                           new_match = 1;
391                           for (i = 0; i < num_tags; i++)
392                             match_tags[i] = reach_next_i->tags[i];
393                         }
394                       reach_next_i++;
395
396                     }
397                   else
398                     {
399                       assert(reach_pos[trans_i->state_id].pos == pos);
400                       /* Another path has also reached this state.  We choose
401                          the winner by examining the tag values for both
402                          paths. */
403                       if (tre_tag_order(num_tags, tnfa->tag_directions,
404                                         tmp_tags,
405                                         *reach_pos[trans_i->state_id].tags))
406                         {
407                           /* The new path wins. */
408                           tmp_iptr = *reach_pos[trans_i->state_id].tags;
409                           *reach_pos[trans_i->state_id].tags = tmp_tags;
410                           if (trans_i->state == tnfa->final)
411                             {
412                               DPRINT(("  found better match\n"));
413                               match_eo = pos;
414                               new_match = 1;
415                               for (i = 0; i < num_tags; i++)
416                                 match_tags[i] = tmp_tags[i];
417                             }
418                           tmp_tags = tmp_iptr;
419                         }
420                     }
421                 }
422             }
423         }
424       reach_next_i->state = NULL;
425     }
426
427   DPRINT(("match end offset = %d\n", match_eo));
428
429 #ifndef TRE_USE_ALLOCA
430   if (buf)
431     xfree(buf);
432 #endif /* !TRE_USE_ALLOCA */
433
434   *match_end_ofs = match_eo;
435   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
436 }
437
438
439 /***********************************************************************
440  from tre-match-backtrack.c
441 ***********************************************************************/
442
443 /*
444   This matcher is for regexps that use back referencing.  Regexp matching
445   with back referencing is an NP-complete problem on the number of back
446   references.  The easiest way to match them is to use a backtracking
447   routine which basically goes through all possible paths in the TNFA
448   and chooses the one which results in the best (leftmost and longest)
449   match.  This can be spectacularly expensive and may run out of stack
450   space, but there really is no better known generic algorithm.  Quoting
451   Henry Spencer from comp.compilers:
452   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
453
454     POSIX.2 REs require longest match, which is really exciting to
455     implement since the obsolete ("basic") variant also includes
456     \<digit>.  I haven't found a better way of tackling this than doing
457     a preliminary match using a DFA (or simulation) on a modified RE
458     that just replicates subREs for \<digit>, and then doing a
459     backtracking match to determine whether the subRE matches were
460     right.  This can be rather slow, but I console myself with the
461     thought that people who use \<digit> deserve very slow execution.
462     (Pun unintentional but very appropriate.)
463
464 */
465
466 typedef struct {
467   int pos;
468   const char *str_byte;
469   tre_tnfa_transition_t *state;
470   int state_id;
471   int next_c;
472   int *tags;
473 #ifdef TRE_MBSTATE
474   mbstate_t mbstate;
475 #endif /* TRE_MBSTATE */
476 } tre_backtrack_item_t;
477
478 typedef struct tre_backtrack_struct {
479   tre_backtrack_item_t item;
480   struct tre_backtrack_struct *prev;
481   struct tre_backtrack_struct *next;
482 } *tre_backtrack_t;
483
484 #ifdef TRE_MBSTATE
485 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
486 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
487 #else /* !TRE_MBSTATE */
488 #define BT_STACK_MBSTATE_IN
489 #define BT_STACK_MBSTATE_OUT
490 #endif /* !TRE_MBSTATE */
491
492
493 #ifdef TRE_USE_ALLOCA
494 #define tre_bt_mem_new            tre_mem_newa
495 #define tre_bt_mem_alloc          tre_mem_alloca
496 #define tre_bt_mem_destroy(obj)   do { } while (0)
497 #else /* !TRE_USE_ALLOCA */
498 #define tre_bt_mem_new            tre_mem_new
499 #define tre_bt_mem_alloc          tre_mem_alloc
500 #define tre_bt_mem_destroy        tre_mem_destroy
501 #endif /* !TRE_USE_ALLOCA */
502
503
504 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
505   do                                                                          \
506     {                                                                         \
507       int i;                                                                  \
508       if (!stack->next)                                                       \
509         {                                                                     \
510           tre_backtrack_t s;                                                  \
511           s = tre_bt_mem_alloc(mem, sizeof(*s));                              \
512           if (!s)                                                             \
513             {                                                                 \
514               tre_bt_mem_destroy(mem);                                        \
515               if (tags)                                                       \
516                 xfree(tags);                                                  \
517               if (pmatch)                                                     \
518                 xfree(pmatch);                                                \
519               if (states_seen)                                                \
520                 xfree(states_seen);                                           \
521               return REG_ESPACE;                                              \
522             }                                                                 \
523           s->prev = stack;                                                    \
524           s->next = NULL;                                                     \
525           s->item.tags = tre_bt_mem_alloc(mem,                                \
526                                           sizeof(*tags) * tnfa->num_tags);    \
527           if (!s->item.tags)                                                  \
528             {                                                                 \
529               tre_bt_mem_destroy(mem);                                        \
530               if (tags)                                                       \
531                 xfree(tags);                                                  \
532               if (pmatch)                                                     \
533                 xfree(pmatch);                                                \
534               if (states_seen)                                                \
535                 xfree(states_seen);                                           \
536               return REG_ESPACE;                                              \
537             }                                                                 \
538           stack->next = s;                                                    \
539           stack = s;                                                          \
540         }                                                                     \
541       else                                                                    \
542         stack = stack->next;                                                  \
543       stack->item.pos = (_pos);                                               \
544       stack->item.str_byte = (_str_byte);                                     \
545       stack->item.state = (_state);                                           \
546       stack->item.state_id = (_state_id);                                     \
547       stack->item.next_c = (_next_c);                                         \
548       for (i = 0; i < tnfa->num_tags; i++)                                    \
549         stack->item.tags[i] = (_tags)[i];                                     \
550       BT_STACK_MBSTATE_IN;                                                    \
551     }                                                                         \
552   while (0)
553
554 #define BT_STACK_POP()                                                        \
555   do                                                                          \
556     {                                                                         \
557       int i;                                                                  \
558       assert(stack->prev);                                                    \
559       pos = stack->item.pos;                                                  \
560       str_byte = stack->item.str_byte;                                        \
561       state = stack->item.state;                                              \
562       next_c = stack->item.next_c;                                            \
563       for (i = 0; i < tnfa->num_tags; i++)                                    \
564         tags[i] = stack->item.tags[i];                                        \
565       BT_STACK_MBSTATE_OUT;                                                   \
566       stack = stack->prev;                                                    \
567     }                                                                         \
568   while (0)
569
570 #undef MIN
571 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
572
573 static reg_errcode_t
574 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
575                        int len, int *match_tags,
576                        int eflags, int *match_end_ofs)
577 {
578   /* State variables required by GET_NEXT_WCHAR. */
579   tre_char_t prev_c = 0, next_c = 0;
580   const char *str_byte = string;
581   int pos = 0;
582   int pos_add_next = 1;
583 #ifdef TRE_MBSTATE
584   mbstate_t mbstate;
585 #endif /* TRE_MBSTATE */
586   int reg_notbol = eflags & REG_NOTBOL;
587   int reg_noteol = eflags & REG_NOTEOL;
588   int reg_newline = tnfa->cflags & REG_NEWLINE;
589
590   /* These are used to remember the necessary values of the above
591      variables to return to the position where the current search
592      started from. */
593   int next_c_start;
594   const char *str_byte_start;
595   int pos_start = -1;
596 #ifdef TRE_MBSTATE
597   mbstate_t mbstate_start;
598 #endif /* TRE_MBSTATE */
599
600   /* Compilation flags for this regexp. */
601   int cflags = tnfa->cflags;
602
603   /* End offset of best match so far, or -1 if no match found yet. */
604   int match_eo = -1;
605   /* Tag arrays. */
606   int *next_tags, *tags = NULL;
607   /* Current TNFA state. */
608   tre_tnfa_transition_t *state;
609   int *states_seen = NULL;
610
611   /* Memory allocator to for allocating the backtracking stack. */
612   tre_mem_t mem = tre_bt_mem_new();
613
614   /* The backtracking stack. */
615   tre_backtrack_t stack;
616
617   tre_tnfa_transition_t *trans_i;
618   regmatch_t *pmatch = NULL;
619   int ret;
620
621 #ifdef TRE_MBSTATE
622   memset(&mbstate, '\0', sizeof(mbstate));
623 #endif /* TRE_MBSTATE */
624
625   if (!mem)
626     return REG_ESPACE;
627   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
628   if (!stack)
629     {
630       ret = REG_ESPACE;
631       goto error_exit;
632     }
633   stack->prev = NULL;
634   stack->next = NULL;
635
636 #ifdef TRE_USE_ALLOCA
637   tags = alloca(sizeof(*tags) * tnfa->num_tags);
638   pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
639   states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
640 #else /* !TRE_USE_ALLOCA */
641   tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
642   if (!tags)
643     {
644       ret = REG_ESPACE;
645       goto error_exit;
646     }
647   pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
648   if (!pmatch)
649     {
650       ret = REG_ESPACE;
651       goto error_exit;
652     }
653   states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
654   if (!states_seen)
655     {
656       ret = REG_ESPACE;
657       goto error_exit;
658     }
659 #endif /* !TRE_USE_ALLOCA */
660
661  retry:
662   {
663     int i;
664     for (i = 0; i < tnfa->num_tags; i++)
665       {
666         tags[i] = -1;
667         if (match_tags)
668           match_tags[i] = -1;
669       }
670     for (i = 0; i < tnfa->num_states; i++)
671       states_seen[i] = 0;
672   }
673
674   state = NULL;
675   pos = pos_start;
676   GET_NEXT_WCHAR();
677   pos_start = pos;
678   next_c_start = next_c;
679   str_byte_start = str_byte;
680 #ifdef TRE_MBSTATE
681   mbstate_start = mbstate;
682 #endif /* TRE_MBSTATE */
683
684   /* Handle initial states. */
685   next_tags = NULL;
686   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
687     {
688       DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
689       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
690         {
691           DPRINT(("assert failed\n"));
692           continue;
693         }
694       if (state == NULL)
695         {
696           /* Start from this state. */
697           state = trans_i->state;
698           next_tags = trans_i->tags;
699         }
700       else
701         {
702           /* Backtrack to this state. */
703           DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
704           BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
705                         trans_i->state_id, next_c, tags, mbstate);
706           {
707             int *tmp = trans_i->tags;
708             if (tmp)
709               while (*tmp >= 0)
710                 stack->item.tags[*tmp++] = pos;
711           }
712         }
713     }
714
715   if (next_tags)
716     for (; *next_tags >= 0; next_tags++)
717       tags[*next_tags] = pos;
718
719
720   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
721   DPRINT(("pos:chr/code | state and tags\n"));
722   DPRINT(("-------------+------------------------------------------------\n"));
723
724   if (state == NULL)
725     goto backtrack;
726
727   while (1)
728     {
729       tre_tnfa_transition_t *trans_i, *next_state;
730       int empty_br_match;
731
732       DPRINT(("start loop\n"));
733       if (state == tnfa->final)
734         {
735           DPRINT(("  match found, %d %d\n", match_eo, pos));
736           if (match_eo < pos
737               || (match_eo == pos
738                   && match_tags
739                   && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
740                                    tags, match_tags)))
741             {
742               int i;
743               /* This match wins the previous match. */
744               DPRINT(("  win previous\n"));
745               match_eo = pos;
746               if (match_tags)
747                 for (i = 0; i < tnfa->num_tags; i++)
748                   match_tags[i] = tags[i];
749             }
750           /* Our TNFAs never have transitions leaving from the final state,
751              so we jump right to backtracking. */
752           goto backtrack;
753         }
754
755 #ifdef TRE_DEBUG
756       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
757               state));
758       {
759         int i;
760         for (i = 0; i < tnfa->num_tags; i++)
761           DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
762         DPRINT(("\n"));
763       }
764 #endif /* TRE_DEBUG */
765
766       /* Go to the next character in the input string. */
767       empty_br_match = 0;
768       trans_i = state;
769       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
770         {
771           /* This is a back reference state.  All transitions leaving from
772              this state have the same back reference "assertion".  Instead
773              of reading the next character, we match the back reference. */
774           int so, eo, bt = trans_i->u.backref;
775           int bt_len;
776           int result;
777
778           DPRINT(("  should match back reference %d\n", bt));
779           /* Get the substring we need to match against.  Remember to
780              turn off REG_NOSUB temporarily. */
781           tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & !REG_NOSUB,
782                           tnfa, tags, pos);
783           so = pmatch[bt].rm_so;
784           eo = pmatch[bt].rm_eo;
785           bt_len = eo - so;
786
787           if (len < 0)
788             {
789               result = strncmp((char*)string + so, str_byte - 1, bt_len);
790             }
791           else if (len - pos < bt_len)
792             result = 1;
793           else
794             result = memcmp((char*)string + so, str_byte - 1, bt_len);
795
796           /* We can ignore multibyte characters here because the backref
797              string is already aligned at character boundaries. */
798           if (result == 0)
799             {
800               /* Back reference matched.  Check for infinite loop. */
801               if (bt_len == 0)
802                 empty_br_match = 1;
803               if (empty_br_match && states_seen[trans_i->state_id])
804                 {
805                   DPRINT(("  avoid loop\n"));
806                   goto backtrack;
807                 }
808
809               states_seen[trans_i->state_id] = empty_br_match;
810
811               /* Advance in input string and resync `prev_c', `next_c'
812                  and pos. */
813               DPRINT(("  back reference matched\n"));
814               str_byte += bt_len - 1;
815               pos += bt_len - 1;
816               GET_NEXT_WCHAR();
817               DPRINT(("  pos now %d\n", pos));
818             }
819           else
820             {
821               DPRINT(("  back reference did not match\n"));
822               goto backtrack;
823             }
824         }
825       else
826         {
827           /* Check for end of string. */
828           if (len < 0)
829             {
830               if (next_c == L'\0')
831                 goto backtrack;
832             }
833           else
834             {
835               if (pos >= len)
836                 goto backtrack;
837             }
838
839           /* Read the next character. */
840           GET_NEXT_WCHAR();
841         }
842
843       next_state = NULL;
844       for (trans_i = state; trans_i->state; trans_i++)
845         {
846           DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
847                   trans_i->code_min, trans_i->code_max,
848                   trans_i->code_min, trans_i->code_max,
849                   trans_i->assertions, trans_i->state_id));
850           if (trans_i->code_min <= prev_c && trans_i->code_max >= prev_c)
851             {
852               if (trans_i->assertions
853                   && (CHECK_ASSERTIONS(trans_i->assertions)
854                       /* Handle character class transitions. */
855                       || ((trans_i->assertions & ASSERT_CHAR_CLASS)
856                           && !(cflags & REG_ICASE)
857                           && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))
858                       || ((trans_i->assertions & ASSERT_CHAR_CLASS)
859                           && (cflags & REG_ICASE)
860                           && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
861                                            trans_i->u.class)
862                               && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
863                                               trans_i->u.class)))
864                       || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
865                           && tre_neg_char_classes_match(trans_i->neg_classes,
866                                                         (tre_cint_t)prev_c,
867                                                         cflags & REG_ICASE))))
868                 {
869                   DPRINT(("  assertion failed\n"));
870                   continue;
871                 }
872
873               if (next_state == NULL)
874                 {
875                   /* First matching transition. */
876                   DPRINT(("  Next state is %d\n", trans_i->state_id));
877                   next_state = trans_i->state;
878                   next_tags = trans_i->tags;
879                 }
880               else
881                 {
882                   /* Second mathing transition.  We may need to backtrack here
883                      to take this transition instead of the first one, so we
884                      push this transition in the backtracking stack so we can
885                      jump back here if needed. */
886                   DPRINT(("  saving state %d for backtracking\n",
887                           trans_i->state_id));
888                   BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
889                                 trans_i->state_id, next_c, tags, mbstate);
890                   {
891                     int *tmp;
892                     for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
893                       stack->item.tags[*tmp] = pos;
894                   }
895 #if 0 /* XXX - it's important not to look at all transitions here to keep
896          the stack small! */
897                   break;
898 #endif
899                 }
900             }
901         }
902
903       if (next_state != NULL)
904         {
905           /* Matching transitions were found.  Take the first one. */
906           state = next_state;
907
908           /* Update the tag values. */
909           if (next_tags)
910             while (*next_tags >= 0)
911               tags[*next_tags++] = pos;
912         }
913       else
914         {
915         backtrack:
916           /* A matching transition was not found.  Try to backtrack. */
917           if (stack->prev)
918             {
919               DPRINT(("  backtracking\n"));
920               if (stack->item.state->assertions && ASSERT_BACKREF)
921                 {
922                   DPRINT(("  states_seen[%d] = 0\n",
923                           stack->item.state_id));
924                   states_seen[stack->item.state_id] = 0;
925                 }
926
927               BT_STACK_POP();
928             }
929           else if (match_eo < 0)
930             {
931               /* Try starting from a later position in the input string. */
932               /* Check for end of string. */
933               if (len < 0)
934                 {
935                   if (next_c == L'\0')
936                     {
937                       DPRINT(("end of string.\n"));
938                       break;
939                     }
940                 }
941               else
942                 {
943                   if (pos >= len)
944                     {
945                       DPRINT(("end of string.\n"));
946                       break;
947                     }
948                 }
949               DPRINT(("restarting from next start position\n"));
950               next_c = next_c_start;
951 #ifdef TRE_MBSTATE
952               mbstate = mbstate_start;
953 #endif /* TRE_MBSTATE */
954               str_byte = str_byte_start;
955               goto retry;
956             }
957           else
958             {
959               DPRINT(("finished\n"));
960               break;
961             }
962         }
963     }
964
965   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
966   *match_end_ofs = match_eo;
967
968  error_exit:
969   tre_bt_mem_destroy(mem);
970 #ifndef TRE_USE_ALLOCA
971   if (tags)
972     xfree(tags);
973   if (pmatch)
974     xfree(pmatch);
975   if (states_seen)
976     xfree(states_seen);
977 #endif /* !TRE_USE_ALLOCA */
978
979   return ret;
980 }
981
982
983 /***********************************************************************
984  from regexec.c
985 ***********************************************************************/
986
987 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
988    endpoint values. */
989 static void
990 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
991                 const tre_tnfa_t *tnfa, int *tags, int match_eo)
992 {
993   tre_submatch_data_t *submatch_data;
994   unsigned int i, j;
995   int *parents;
996
997   i = 0;
998   if (match_eo >= 0 && !(cflags & REG_NOSUB))
999     {
1000       /* Construct submatch offsets from the tags. */
1001       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
1002       submatch_data = tnfa->submatch_data;
1003       while (i < tnfa->num_submatches && i < nmatch)
1004         {
1005           if (submatch_data[i].so_tag == tnfa->end_tag)
1006             pmatch[i].rm_so = match_eo;
1007           else
1008             pmatch[i].rm_so = tags[submatch_data[i].so_tag];
1009
1010           if (submatch_data[i].eo_tag == tnfa->end_tag)
1011             pmatch[i].rm_eo = match_eo;
1012           else
1013             pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
1014
1015           /* If either of the endpoints were not used, this submatch
1016              was not part of the match. */
1017           if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
1018             pmatch[i].rm_so = pmatch[i].rm_eo = -1;
1019
1020           DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
1021                   submatch_data[i].so_tag, pmatch[i].rm_so,
1022                   submatch_data[i].eo_tag, pmatch[i].rm_eo));
1023           i++;
1024         }
1025       /* Reset all submatches that are not within all of their parent
1026          submatches. */
1027       i = 0;
1028       while (i < tnfa->num_submatches && i < nmatch)
1029         {
1030           if (pmatch[i].rm_eo == -1)
1031             assert(pmatch[i].rm_so == -1);
1032           assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
1033
1034           parents = submatch_data[i].parents;
1035           if (parents != NULL)
1036             for (j = 0; parents[j] >= 0; j++)
1037               {
1038                 DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
1039                 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
1040                     || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
1041                   pmatch[i].rm_so = pmatch[i].rm_eo = -1;
1042               }
1043           i++;
1044         }
1045     }
1046
1047   while (i < nmatch)
1048     {
1049       pmatch[i].rm_so = -1;
1050       pmatch[i].rm_eo = -1;
1051       i++;
1052     }
1053 }
1054
1055
1056 /*
1057   Wrapper functions for POSIX compatible regexp matching.
1058 */
1059
1060 static int
1061 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
1062           size_t nmatch, regmatch_t pmatch[], int eflags)
1063 {
1064   reg_errcode_t status;
1065   int *tags = NULL, eo;
1066   if (tnfa->num_tags > 0 && nmatch > 0)
1067     {
1068 #ifdef TRE_USE_ALLOCA
1069       tags = alloca(sizeof(*tags) * tnfa->num_tags);
1070 #else /* !TRE_USE_ALLOCA */
1071       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
1072 #endif /* !TRE_USE_ALLOCA */
1073       if (tags == NULL)
1074         return REG_ESPACE;
1075     }
1076
1077   /* Dispatch to the appropriate matcher. */
1078   if (tnfa->have_backrefs)
1079     {
1080       /* The regex has back references, use the backtracking matcher. */
1081       status = tre_tnfa_run_backtrack(tnfa, string, len, tags, eflags, &eo);
1082     }
1083   else
1084     {
1085       /* Exact matching, no back references, use the parallel matcher. */
1086       status = tre_tnfa_run_parallel(tnfa, string, len, tags, eflags, &eo);
1087     }
1088
1089   if (status == REG_OK)
1090     /* A match was found, so fill the submatch registers. */
1091     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1092 #ifndef TRE_USE_ALLOCA
1093   if (tags)
1094     xfree(tags);
1095 #endif /* !TRE_USE_ALLOCA */
1096   return status;
1097 }
1098
1099 int
1100 regexec(const regex_t *preg, const char *str,
1101         size_t nmatch, regmatch_t pmatch[], int eflags)
1102 {
1103   return tre_match((void *)preg->TRE_REGEX_T_FIELD, str, -1,
1104                    nmatch, pmatch, eflags);
1105 }
1106
1107 /* EOF */