X-Git-Url: http://nsz.repo.hu/git/?a=blobdiff_plain;f=src%2Fregex%2Fregcomp.c;h=fb24556ebe12e33cd07d86a7953e2ce30f45c3b7;hb=25ea9f712c30c32957de493d4711ee39d0bbb024;hp=b3dbb25231cd55ec0b715fd41553619fffa9e6e9;hpb=25160f1c08235cf5b6a9617c5640380618a0f6ff;p=musl diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index b3dbb252..fb24556e 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -401,8 +401,8 @@ typedef struct { tre_ast_node_t *n; /* Position in the regexp pattern after a parse function returns. */ const char *s; - /* The first character of the regexp. */ - const char *re; + /* The first character of the last subexpression parsed. */ + const char *start; /* Current submatch ID. */ int submatch_id; /* Current position (number of literal). */ @@ -636,6 +636,20 @@ static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s) goto parse_bracket_done; if (neg.negate) { + /* + * With REG_NEWLINE, POSIX requires that newlines are not matched by + * any form of a non-matching list. + */ + if (ctx->cflags & REG_NEWLINE) { + lit = tre_new_lit(&ls); + if (!lit) { + err = REG_ESPACE; + goto parse_bracket_done; + } + lit->code_min = '\n'; + lit->code_max = '\n'; + lit->position = -1; + } /* Sort the array if we need to negate it. */ qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit); /* extra lit for the last negated range */ @@ -708,7 +722,7 @@ static const char *parse_dup_count(const char *s, int *n) return s; } -static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s) +static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax) { int min, max; @@ -723,19 +737,13 @@ static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s) max > RE_DUP_MAX || min > RE_DUP_MAX || min < 0 || - (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') || + (!ere && *s++ != '\\') || *s++ != '}' ) - return REG_BADBR; - - if (min == 0 && max == 0) - ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - else - ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); - if (!ctx->n) - return REG_ESPACE; - ctx->s = s; - return REG_OK; + return 0; + *pmin = min; + *pmax = max; + return s; } static int hexval(unsigned c) @@ -882,20 +890,19 @@ static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s) break; case '^': /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */ - if (!ere && s != ctx->re) + if (!ere && s != ctx->start) goto parse_literal; node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); s++; break; case '$': - /* '$' is special everywhere in EREs, and in the end of the string in BREs. */ - if (!ere && s[1]) + /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */ + if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|'))) goto parse_literal; node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); s++; break; case '*': - return REG_BADPAT; case '{': case '+': case '?': @@ -951,7 +958,7 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) { tre_ast_node_t *nbranch=0, *nunion=0; int ere = ctx->cflags & REG_EXTENDED; - const char *s = ctx->re; + const char *s = ctx->start; int subid = 0; int depth = 0; reg_errcode_t err; @@ -969,6 +976,7 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) s++; depth++; nbranch = nunion = 0; + ctx->start = s; continue; } if ((!ere && *s == '\\' && s[1] == ')') || @@ -984,10 +992,9 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) } parse_iter: - /* extension: repetitions are rejected after an empty node - eg. (+), |*, {2}, but assertions are not treated as empty - so ^* or $? are accepted currently. */ for (;;) { + int min, max; + if (*s!='\\' && *s!='*') { if (!ere) break; @@ -1002,26 +1009,33 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) if (*s=='\\') s++; + /* handle ^* at the start of a BRE. */ + if (!ere && s==ctx->start+1 && s[-1]=='^') + break; + /* extension: multiple consecutive *+?{,} is unspecified, but (a+)+ has to be supported so accepting a++ makes sense, note however that the RE_DUP_MAX limit can be circumvented: (a{255}){255} uses a lot of memory.. */ if (*s=='{') { - err = parse_dup(ctx, s+1); - if (err != REG_OK) - return err; - s = ctx->s; + s = parse_dup(s+1, ere, &min, &max); + if (!s) + return REG_BADBR; } else { - int min=0, max=-1; + min=0; + max=-1; if (*s == '+') min = 1; if (*s == '?') max = 1; s++; - ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); - if (!ctx->n) - return REG_ESPACE; } + if (max == 0) + ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + else + ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); + if (!ctx->n) + return REG_ESPACE; } nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n); @@ -1039,8 +1053,10 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) if (c == '\\' && s[1] == '|') { s+=2; + ctx->start = s; } else if (c == '|') { s++; + ctx->start = s; } else { if (c == '\\') { if (!depth) return REG_EPAREN; @@ -1107,6 +1123,7 @@ tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) c->right->firstpos = NULL; c->right->lastpos = NULL; c->right->num_tags = 0; + c->right->num_submatches = 0; node->obj = c; node->type = CATENATION; return REG_OK; @@ -1137,6 +1154,7 @@ tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) c->left->firstpos = NULL; c->left->lastpos = NULL; c->left->num_tags = 0; + c->left->num_submatches = 0; node->obj = c; node->type = CATENATION; return REG_OK; @@ -2689,7 +2707,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) /* Allocate a stack used throughout the compilation process for various purposes. */ - stack = tre_stack_new(512, 10240, 128); + stack = tre_stack_new(512, 1024000, 128); if (!stack) return REG_ESPACE; /* Allocate a fast memory allocator. */ @@ -2704,7 +2722,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) memset(&parse_ctx, 0, sizeof(parse_ctx)); parse_ctx.mem = mem; parse_ctx.stack = stack; - parse_ctx.re = regex; + parse_ctx.start = regex; parse_ctx.cflags = cflags; parse_ctx.max_backref = -1; errcode = tre_parse(&parse_ctx);