123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- From aee6abb2400b9a955c2b41166db1c22f63ad42ef Mon Sep 17 00:00:00 2001
- From: Rich Felker <[email protected]>
- Date: Thu, 6 Oct 2016 12:15:47 -0400
- Subject: fix regexec with haystack strings longer than INT_MAX
- the regexec code we inherited from TRE is utterly wrong with respect
- to the integer types it uses. while it doesn't appear that compilers
- are producing unsafe output, signed integer overflows do seem to
- happen, and regexec fails to find matches past offset INT_MAX.
- this patch changes the type of all variables/fields used to store
- offsets in the string from int to regoff_t. after the changes, basic
- testing showed that regexec can now find matches past 2GB (INT_MAX)
- and past 4GB on x86_64, and that code generation is unchanged on i386.
- ---
- src/regex/regexec.c | 54 +++++++++++++++++++++++++++--------------------------
- 1 file changed, 28 insertions(+), 26 deletions(-)
- diff --git a/src/regex/regexec.c b/src/regex/regexec.c
- index dd52319..5c4cb92 100644
- --- a/src/regex/regexec.c
- +++ b/src/regex/regexec.c
- @@ -44,7 +44,7 @@
-
- static void
- tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- - const tre_tnfa_t *tnfa, int *tags, int match_eo);
- + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
-
- /***********************************************************************
- from tre-match-utils.h
- @@ -97,7 +97,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- /* Returns 1 if `t1' wins `t2', 0 otherwise. */
- static int
- tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
- - int *t1, int *t2)
- + regoff_t *t1, regoff_t *t2)
- {
- int i;
- for (i = 0; i < num_tags; i++)
- @@ -157,25 +157,25 @@ tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
-
- typedef struct {
- tre_tnfa_transition_t *state;
- - int *tags;
- + regoff_t *tags;
- } tre_tnfa_reach_t;
-
- typedef struct {
- - int pos;
- - int **tags;
- + regoff_t pos;
- + regoff_t **tags;
- } tre_reach_pos_t;
-
-
- static reg_errcode_t
- tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
- - int *match_tags, int eflags,
- - int *match_end_ofs)
- + regoff_t *match_tags, int eflags,
- + regoff_t *match_end_ofs)
- {
- /* State variables required by GET_NEXT_WCHAR. */
- tre_char_t prev_c = 0, next_c = 0;
- const char *str_byte = string;
- - int pos = -1;
- - int pos_add_next = 1;
- + regoff_t pos = -1;
- + regoff_t pos_add_next = 1;
- #ifdef TRE_MBSTATE
- mbstate_t mbstate;
- #endif /* TRE_MBSTATE */
- @@ -191,10 +191,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
- int *tag_i;
- int num_tags, i;
-
- - int match_eo = -1; /* end offset of match (-1 if no match found yet) */
- + regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */
- int new_match = 0;
- - int *tmp_tags = NULL;
- - int *tmp_iptr;
- + regoff_t *tmp_tags = NULL;
- + regoff_t *tmp_iptr;
-
- #ifdef TRE_MBSTATE
- memset(&mbstate, '\0', sizeof(mbstate));
- @@ -214,7 +214,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
-
- /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
- * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
- - if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states))
- + if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
- goto error_exit;
-
- /* Likewise check rbytes. */
- @@ -229,7 +229,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
- tbytes = sizeof(*tmp_tags) * num_tags;
- rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
- pbytes = sizeof(*reach_pos) * tnfa->num_states;
- - xbytes = sizeof(int) * num_tags;
- + xbytes = sizeof(regoff_t) * num_tags;
- total_bytes =
- (sizeof(long) - 1) * 4 /* for alignment paddings */
- + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
- @@ -490,12 +490,12 @@ error_exit:
- */
-
- typedef struct {
- - int pos;
- + regoff_t pos;
- const char *str_byte;
- tre_tnfa_transition_t *state;
- int state_id;
- int next_c;
- - int *tags;
- + regoff_t *tags;
- #ifdef TRE_MBSTATE
- mbstate_t mbstate;
- #endif /* TRE_MBSTATE */
- @@ -591,13 +591,13 @@ typedef struct tre_backtrack_struct {
-
- static reg_errcode_t
- tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- - int *match_tags, int eflags, int *match_end_ofs)
- + regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
- {
- /* State variables required by GET_NEXT_WCHAR. */
- tre_char_t prev_c = 0, next_c = 0;
- const char *str_byte = string;
- - int pos = 0;
- - int pos_add_next = 1;
- + regoff_t pos = 0;
- + regoff_t pos_add_next = 1;
- #ifdef TRE_MBSTATE
- mbstate_t mbstate;
- #endif /* TRE_MBSTATE */
- @@ -610,15 +610,16 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- started from. */
- int next_c_start;
- const char *str_byte_start;
- - int pos_start = -1;
- + regoff_t pos_start = -1;
- #ifdef TRE_MBSTATE
- mbstate_t mbstate_start;
- #endif /* TRE_MBSTATE */
-
- /* End offset of best match so far, or -1 if no match found yet. */
- - int match_eo = -1;
- + regoff_t match_eo = -1;
- /* Tag arrays. */
- - int *next_tags, *tags = NULL;
- + int *next_tags;
- + regoff_t *tags = NULL;
- /* Current TNFA state. */
- tre_tnfa_transition_t *state;
- int *states_seen = NULL;
- @@ -768,8 +769,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- /* This is a back reference state. All transitions leaving from
- this state have the same back reference "assertion". Instead
- of reading the next character, we match the back reference. */
- - int so, eo, bt = trans_i->u.backref;
- - int bt_len;
- + regoff_t so, eo;
- + int bt = trans_i->u.backref;
- + regoff_t bt_len;
- int result;
-
- /* Get the substring we need to match against. Remember to
- @@ -926,7 +928,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- endpoint values. */
- static void
- tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- - const tre_tnfa_t *tnfa, int *tags, int match_eo)
- + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
- {
- tre_submatch_data_t *submatch_data;
- unsigned int i, j;
- @@ -996,7 +998,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
- {
- tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
- reg_errcode_t status;
- - int *tags = NULL, eo;
- + regoff_t *tags = NULL, eo;
- if (tnfa->cflags & REG_NOSUB) nmatch = 0;
- if (tnfa->num_tags > 0 && nmatch > 0)
- {
- --
- cgit v0.11.2
|