tre_ast_node_t *n;
/* Position in the regexp pattern after a parse function returns. */
const char *s;
- /* The first character of the regexp. */
- const char *re;
+ /* The first character of the last subexpression parsed. */
+ const char *start;
/* Current submatch ID. */
int submatch_id;
/* Current position (number of literal). */
goto parse_bracket_done;
if (neg.negate) {
+ /*
+ * With REG_NEWLINE, POSIX requires that newlines are not matched by
+ * any form of a non-matching list.
+ */
+ if (ctx->cflags & REG_NEWLINE) {
+ lit = tre_new_lit(&ls);
+ if (!lit) {
+ err = REG_ESPACE;
+ goto parse_bracket_done;
+ }
+ lit->code_min = '\n';
+ lit->code_max = '\n';
+ lit->position = -1;
+ }
/* Sort the array if we need to negate it. */
qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
/* extra lit for the last negated range */
return s;
}
-static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
+static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
{
int min, max;
max > RE_DUP_MAX ||
min > RE_DUP_MAX ||
min < 0 ||
- (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
+ (!ere && *s++ != '\\') ||
*s++ != '}'
)
- return REG_BADBR;
-
- if (min == 0 && max == 0)
- ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
- else
- ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
- if (!ctx->n)
- return REG_ESPACE;
- ctx->s = s;
- return REG_OK;
+ return 0;
+ *pmin = min;
+ *pmax = max;
+ return s;
}
static int hexval(unsigned c)
break;
case '^':
/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
- if (!ere && s != ctx->re)
+ if (!ere && s != ctx->start)
goto parse_literal;
node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
s++;
break;
case '$':
- /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
- if (!ere && s[1])
+ /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
+ if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
goto parse_literal;
node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
s++;
break;
case '*':
- return REG_BADPAT;
case '{':
case '+':
case '?':
{
tre_ast_node_t *nbranch=0, *nunion=0;
int ere = ctx->cflags & REG_EXTENDED;
- const char *s = ctx->re;
+ const char *s = ctx->start;
int subid = 0;
int depth = 0;
reg_errcode_t err;
s++;
depth++;
nbranch = nunion = 0;
+ ctx->start = s;
continue;
}
if ((!ere && *s == '\\' && s[1] == ')') ||
}
parse_iter:
- /* extension: repetitions are rejected after an empty node
- eg. (+), |*, {2}, but assertions are not treated as empty
- so ^* or $? are accepted currently. */
for (;;) {
+ int min, max;
+
if (*s!='\\' && *s!='*') {
if (!ere)
break;
if (*s=='\\')
s++;
+ /* handle ^* at the start of a BRE. */
+ if (!ere && s==ctx->start+1 && s[-1]=='^')
+ break;
+
/* extension: multiple consecutive *+?{,} is unspecified,
but (a+)+ has to be supported so accepting a++ makes
sense, note however that the RE_DUP_MAX limit can be
circumvented: (a{255}){255} uses a lot of memory.. */
if (*s=='{') {
- err = parse_dup(ctx, s+1);
- if (err != REG_OK)
- return err;
- s = ctx->s;
+ s = parse_dup(s+1, ere, &min, &max);
+ if (!s)
+ return REG_BADBR;
} else {
- int min=0, max=-1;
+ min=0;
+ max=-1;
if (*s == '+')
min = 1;
if (*s == '?')
max = 1;
s++;
- ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
- if (!ctx->n)
- return REG_ESPACE;
}
+ if (max == 0)
+ ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ else
+ ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+ if (!ctx->n)
+ return REG_ESPACE;
}
nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
if (c == '\\' && s[1] == '|') {
s+=2;
+ ctx->start = s;
} else if (c == '|') {
s++;
+ ctx->start = s;
} else {
if (c == '\\') {
if (!depth) return REG_EPAREN;
c->right->firstpos = NULL;
c->right->lastpos = NULL;
c->right->num_tags = 0;
+ c->right->num_submatches = 0;
node->obj = c;
node->type = CATENATION;
return REG_OK;
c->left->firstpos = NULL;
c->left->lastpos = NULL;
c->left->num_tags = 0;
+ c->left->num_submatches = 0;
node->obj = c;
node->type = CATENATION;
return REG_OK;
/* Allocate a stack used throughout the compilation process for various
purposes. */
- stack = tre_stack_new(512, 10240, 128);
+ stack = tre_stack_new(512, 1024000, 128);
if (!stack)
return REG_ESPACE;
/* Allocate a fast memory allocator. */
memset(&parse_ctx, 0, sizeof(parse_ctx));
parse_ctx.mem = mem;
parse_ctx.stack = stack;
- parse_ctx.re = regex;
+ parse_ctx.start = regex;
parse_ctx.cflags = cflags;
parse_ctx.max_backref = -1;
errcode = tre_parse(&parse_ctx);