038-fix-regexec-with-haystack-strings-longer-than-int_max.patch 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. From aee6abb2400b9a955c2b41166db1c22f63ad42ef Mon Sep 17 00:00:00 2001
  2. From: Rich Felker <[email protected]>
  3. Date: Thu, 6 Oct 2016 12:15:47 -0400
  4. Subject: fix regexec with haystack strings longer than INT_MAX
  5. the regexec code we inherited from TRE is utterly wrong with respect
  6. to the integer types it uses. while compilers do not appear to be
  7. producing unsafe output, signed integer overflows do seem to happen,
  8. and regexec fails to find matches past offset INT_MAX.
  9. this patch changes the type of all variables/fields used to store
  10. offsets in the string from int to regoff_t. after the changes, basic
  11. testing showed that regexec can now find matches past 2GB (INT_MAX)
  12. and past 4GB on x86_64, and code generation is unchanged on i386.
  13. ---
  14. src/regex/regexec.c | 54 +++++++++++++++++++++++++++--------------------------
  15. 1 file changed, 28 insertions(+), 26 deletions(-)
  16. diff --git a/src/regex/regexec.c b/src/regex/regexec.c
  17. index dd52319..5c4cb92 100644
  18. --- a/src/regex/regexec.c
  19. +++ b/src/regex/regexec.c
  20. @@ -44,7 +44,7 @@
  21. static void
  22. tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
  23. - const tre_tnfa_t *tnfa, int *tags, int match_eo);
  24. + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
  25. /***********************************************************************
  26. from tre-match-utils.h
  27. @@ -97,7 +97,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
  28. /* Returns 1 if `t1' wins `t2', 0 otherwise. */
  29. static int
  30. tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
  31. - int *t1, int *t2)
  32. + regoff_t *t1, regoff_t *t2)
  33. {
  34. int i;
  35. for (i = 0; i < num_tags; i++)
  36. @@ -157,25 +157,25 @@ tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
  37. typedef struct {
  38. tre_tnfa_transition_t *state;
  39. - int *tags;
  40. + regoff_t *tags;
  41. } tre_tnfa_reach_t;
  42. typedef struct {
  43. - int pos;
  44. - int **tags;
  45. + regoff_t pos;
  46. + regoff_t **tags;
  47. } tre_reach_pos_t;
  48. static reg_errcode_t
  49. tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
  50. - int *match_tags, int eflags,
  51. - int *match_end_ofs)
  52. + regoff_t *match_tags, int eflags,
  53. + regoff_t *match_end_ofs)
  54. {
  55. /* State variables required by GET_NEXT_WCHAR. */
  56. tre_char_t prev_c = 0, next_c = 0;
  57. const char *str_byte = string;
  58. - int pos = -1;
  59. - int pos_add_next = 1;
  60. + regoff_t pos = -1;
  61. + regoff_t pos_add_next = 1;
  62. #ifdef TRE_MBSTATE
  63. mbstate_t mbstate;
  64. #endif /* TRE_MBSTATE */
  65. @@ -191,10 +191,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
  66. int *tag_i;
  67. int num_tags, i;
  68. - int match_eo = -1; /* end offset of match (-1 if no match found yet) */
  69. + regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */
  70. int new_match = 0;
  71. - int *tmp_tags = NULL;
  72. - int *tmp_iptr;
  73. + regoff_t *tmp_tags = NULL;
  74. + regoff_t *tmp_iptr;
  75. #ifdef TRE_MBSTATE
  76. memset(&mbstate, '\0', sizeof(mbstate));
  77. @@ -214,7 +214,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
  78. /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
  79. * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
  80. - if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states))
  81. + if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
  82. goto error_exit;
  83. /* Likewise check rbytes. */
  84. @@ -229,7 +229,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
  85. tbytes = sizeof(*tmp_tags) * num_tags;
  86. rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
  87. pbytes = sizeof(*reach_pos) * tnfa->num_states;
  88. - xbytes = sizeof(int) * num_tags;
  89. + xbytes = sizeof(regoff_t) * num_tags;
  90. total_bytes =
  91. (sizeof(long) - 1) * 4 /* for alignment paddings */
  92. + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
  93. @@ -490,12 +490,12 @@ error_exit:
  94. */
  95. typedef struct {
  96. - int pos;
  97. + regoff_t pos;
  98. const char *str_byte;
  99. tre_tnfa_transition_t *state;
  100. int state_id;
  101. int next_c;
  102. - int *tags;
  103. + regoff_t *tags;
  104. #ifdef TRE_MBSTATE
  105. mbstate_t mbstate;
  106. #endif /* TRE_MBSTATE */
  107. @@ -591,13 +591,13 @@ typedef struct tre_backtrack_struct {
  108. static reg_errcode_t
  109. tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
  110. - int *match_tags, int eflags, int *match_end_ofs)
  111. + regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
  112. {
  113. /* State variables required by GET_NEXT_WCHAR. */
  114. tre_char_t prev_c = 0, next_c = 0;
  115. const char *str_byte = string;
  116. - int pos = 0;
  117. - int pos_add_next = 1;
  118. + regoff_t pos = 0;
  119. + regoff_t pos_add_next = 1;
  120. #ifdef TRE_MBSTATE
  121. mbstate_t mbstate;
  122. #endif /* TRE_MBSTATE */
  123. @@ -610,15 +610,16 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
  124. started from. */
  125. int next_c_start;
  126. const char *str_byte_start;
  127. - int pos_start = -1;
  128. + regoff_t pos_start = -1;
  129. #ifdef TRE_MBSTATE
  130. mbstate_t mbstate_start;
  131. #endif /* TRE_MBSTATE */
  132. /* End offset of best match so far, or -1 if no match found yet. */
  133. - int match_eo = -1;
  134. + regoff_t match_eo = -1;
  135. /* Tag arrays. */
  136. - int *next_tags, *tags = NULL;
  137. + int *next_tags;
  138. + regoff_t *tags = NULL;
  139. /* Current TNFA state. */
  140. tre_tnfa_transition_t *state;
  141. int *states_seen = NULL;
  142. @@ -768,8 +769,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
  143. /* This is a back reference state. All transitions leaving from
  144. this state have the same back reference "assertion". Instead
  145. of reading the next character, we match the back reference. */
  146. - int so, eo, bt = trans_i->u.backref;
  147. - int bt_len;
  148. + regoff_t so, eo;
  149. + int bt = trans_i->u.backref;
  150. + regoff_t bt_len;
  151. int result;
  152. /* Get the substring we need to match against. Remember to
  153. @@ -926,7 +928,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
  154. endpoint values. */
  155. static void
  156. tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
  157. - const tre_tnfa_t *tnfa, int *tags, int match_eo)
  158. + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
  159. {
  160. tre_submatch_data_t *submatch_data;
  161. unsigned int i, j;
  162. @@ -996,7 +998,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
  163. {
  164. tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
  165. reg_errcode_t status;
  166. - int *tags = NULL, eo;
  167. + regoff_t *tags = NULL, eo;
  168. if (tnfa->cflags & REG_NOSUB) nmatch = 0;
  169. if (tnfa->num_tags > 0 && nmatch > 0)
  170. {
  171. --
  172. cgit v0.11.2