tre_ast_node_t *n;
/* Position in the regexp pattern after a parse function returns. */
const char *s;
- /* The first character of the regexp. */
- const char *re;
+ /* The first character of the last subexpression parsed. */
+ const char *start;
/* Current submatch ID. */
int submatch_id;
/* Current position (number of literal). */
goto parse_bracket_done;
if (neg.negate) {
+ /*
+ * With REG_NEWLINE, POSIX requires that newlines are not matched by
+ * any form of a non-matching list.
+ */
+ if (ctx->cflags & REG_NEWLINE) {
+ lit = tre_new_lit(&ls);
+ if (!lit) {
+ err = REG_ESPACE;
+ goto parse_bracket_done;
+ }
+ lit->code_min = '\n';
+ lit->code_max = '\n';
+ lit->position = -1;
+ }
/* Sort the array if we need to negate it. */
qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
/* extra lit for the last negated range */
return s;
}
-static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
+static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
{
int min, max;
max > RE_DUP_MAX ||
min > RE_DUP_MAX ||
min < 0 ||
- (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
+ (!ere && *s++ != '\\') ||
*s++ != '}'
)
- return REG_BADBR;
-
- if (min == 0 && max == 0)
- ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
- else
- ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
- if (!ctx->n)
- return REG_ESPACE;
- ctx->s = s;
- return REG_OK;
+ return 0;
+ *pmin = min;
+ *pmax = max;
+ return s;
}
static int hexval(unsigned c)
s--;
break;
case '{':
+ case '+':
+ case '?':
+ /* extension: treat \+, \? as repetitions in BRE */
/* reject repetitions after empty expression in BRE */
if (!ere)
return REG_BADRPT;
break;
case '^':
/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
- if (!ere && s != ctx->re)
+ if (!ere && s != ctx->start)
goto parse_literal;
node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
s++;
break;
case '$':
- /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
- if (!ere && s[1])
+ /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
+ if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
goto parse_literal;
node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
s++;
break;
case '*':
- return REG_BADPAT;
case '{':
case '+':
case '?':
{
tre_ast_node_t *nbranch=0, *nunion=0;
int ere = ctx->cflags & REG_EXTENDED;
- const char *s = ctx->re;
+ const char *s = ctx->start;
int subid = 0;
int depth = 0;
reg_errcode_t err;
s++;
depth++;
nbranch = nunion = 0;
+ ctx->start = s;
continue;
}
if ((!ere && *s == '\\' && s[1] == ')') ||
}
parse_iter:
- /* extension: repetitions are rejected after an empty node
- eg. (+), |*, {2}, but assertions are not treated as empty
- so ^* or $? are accepted currently. */
- switch (*s) {
- case '+':
- case '?':
- if (!ere)
+ for (;;) {
+ int min, max;
+
+ if (*s!='\\' && *s!='*') {
+ if (!ere)
+ break;
+ if (*s!='+' && *s!='?' && *s!='{')
+ break;
+ }
+ if (*s=='\\' && ere)
break;
- /* fallthrough */
- case '*':;
- int min=0, max=-1;
- if (*s == '+')
- min = 1;
- if (*s == '?')
- max = 1;
- s++;
- ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
- if (!ctx->n)
- return REG_ESPACE;
+ /* extension: treat \+, \? as repetitions in BRE */
+ if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
+ break;
+ if (*s=='\\')
+ s++;
+
+ /* handle ^* at the start of a BRE. */
+ if (!ere && s==ctx->start+1 && s[-1]=='^')
+ break;
+
/* extension: multiple consecutive *+?{,} is unspecified,
but (a+)+ has to be supported so accepting a++ makes
sense, note however that the RE_DUP_MAX limit can be
circumvented: (a{255}){255} uses a lot of memory.. */
- goto parse_iter;
- case '\\':
- if (ere || s[1] != '{')
- break;
- s++;
- goto parse_brace;
- case '{':
- if (!ere)
- break;
- parse_brace:
- err = parse_dup(ctx, s+1);
- if (err != REG_OK)
- return err;
- s = ctx->s;
- goto parse_iter;
+ if (*s=='{') {
+ s = parse_dup(s+1, ere, &min, &max);
+ if (!s)
+ return REG_BADBR;
+ } else {
+ min=0;
+ max=-1;
+ if (*s == '+')
+ min = 1;
+ if (*s == '?')
+ max = 1;
+ s++;
+ }
+ if (max == 0)
+ ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ else
+ ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+ if (!ctx->n)
+ return REG_ESPACE;
}
nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
if (c == '\\' && s[1] == '|') {
s+=2;
+ ctx->start = s;
} else if (c == '|') {
s++;
+ ctx->start = s;
} else {
if (c == '\\') {
if (!depth) return REG_EPAREN;
c->right->firstpos = NULL;
c->right->lastpos = NULL;
c->right->num_tags = 0;
+ c->right->num_submatches = 0;
node->obj = c;
node->type = CATENATION;
return REG_OK;
c->left->firstpos = NULL;
c->left->lastpos = NULL;
c->left->num_tags = 0;
+ c->left->num_submatches = 0;
node->obj = c;
node->type = CATENATION;
return REG_OK;
/* Allocate a stack used throughout the compilation process for various
purposes. */
- stack = tre_stack_new(512, 10240, 128);
+ stack = tre_stack_new(512, 1024000, 128);
if (!stack)
return REG_ESPACE;
/* Allocate a fast memory allocator. */
memset(&parse_ctx, 0, sizeof(parse_ctx));
parse_ctx.mem = mem;
parse_ctx.stack = stack;
- parse_ctx.re = regex;
+ parse_ctx.start = regex;
parse_ctx.cflags = cflags;
parse_ctx.max_backref = -1;
errcode = tre_parse(&parse_ctx);