Browse Source

regex: Match ^ at most once in repeated searches

When doing successive matches, track the input start and current search
start positions separately to prevent the `^` anchor from matching in
the middle of the string.  Add policy CMP0186 to provide compatibility.

Issue: #26629
Fixes: #16899
Nikita Nemkin 8 months ago
parent
commit
5d039f3be3

+ 5 - 0
Help/command/list.rst

@@ -246,6 +246,11 @@ For more information on regular expressions look under
                                       <replace_expression> ...)
         :target: TRANSFORM_REPLACE
 
+      .. versionchanged:: 4.1
+        The ``^`` anchor now matches only at the beginning of the input
+        element instead of the beginning of each repeated search.
+        See policy :policy:`CMP0186`.
+
   ``<SELECTOR>`` determines which elements of the list will be transformed.
   Only one type of selector can be specified at a time.
   When given, ``<SELECTOR>`` must be one of the following:

+ 5 - 0
Help/command/string.rst

@@ -117,6 +117,11 @@ Search and Replace With Regular Expressions
   two backslashes (``\\1``) are required in CMake code to get a backslash
   through argument parsing.
 
+.. versionchanged:: 4.1
+  The ``^`` anchor now matches only at the beginning of the input
+  string instead of the beginning of each repeated search.
+  See policy :policy:`CMP0186`.
+
 .. _`Regex Specification`:
 
 Regex Specification

+ 5 - 0
Help/manual/cmake-generator-expressions.7.rst

@@ -532,6 +532,11 @@ List Transformations
 
         $<LIST:TRANSFORM,list,REPLACE,regular_expression,replace_expression[,SELECTOR]>
 
+      .. versionchanged:: 4.1
+        The ``^`` anchor now matches only at the beginning of the input
+        element instead of the beginning of each repeated search.
+        See policy :policy:`CMP0186`.
+
   ``SELECTOR`` determines which items of the list will be transformed.
   Only one type of selector can be specified at a time. When given,
   ``SELECTOR`` must be one of the following:

+ 8 - 0
Help/manual/cmake-policies.7.rst

@@ -92,6 +92,14 @@ Supported Policies
 
 The following policies are supported.
 
+Policies Introduced by CMake 4.1
+--------------------------------
+
+.. toctree::
+   :maxdepth: 1
+
+   CMP0186: Regular expressions match ^ at most once in repeated searches. </policy/CMP0186>
+
 Policies Introduced by CMake 4.0
 --------------------------------
 

+ 43 - 0
Help/policy/CMP0186.rst

@@ -0,0 +1,43 @@
+CMP0186
+-------
+
+.. versionadded:: 4.1
+
+Regular expressions match ``^`` at most once in repeated searches.
+
+This policy affects commands that perform multiple regular expression
+searches:
+
+* :command:`string(REGEX MATCHALL)`
+* :command:`string(REGEX REPLACE)`
+* :command:`list(TRANSFORM REPLACE)`
+
+and the generator expression :genex:`$<LIST:TRANSFORM,list,REPLACE>`.
+
+CMake 4.0 and below match the ``^`` anchor at the start of every
+successive search, leading to multiple matches:
+
+.. code-block:: cmake
+
+  string(REGEX REPLACE "^a" "b" result "aaaa") # result="bbbb"
+  string(REGEX MATCHALL "^a" result "aaaa")    # result="a;a;a;a"
+
+CMake 4.1 and above prefer to match the ``^`` anchor at most once,
+at the start of the input string:
+
+.. code-block:: cmake
+
+  string(REGEX REPLACE "^a" "b" result "aaaa") # result="baaa"
+  string(REGEX MATCHALL "^a" result "aaaa")    # result="a"
+
+This policy provides compatibility for projects that have not been updated.
+
+The ``OLD`` behavior for this policy is to match ``^`` multiple times,
+at the start of each search.  The ``NEW`` behavior for this policy is
+to match ``^`` at most once, at the start of the input string.
+
+.. |INTRODUCED_IN_CMAKE_VERSION| replace:: 4.1
+.. |WARNS_OR_DOES_NOT_WARN| replace:: does *not* warn
+.. include:: STANDARD_ADVICE.txt
+
+.. include:: DEPRECATED.txt

+ 5 - 0
Help/release/dev/regex-fixes.rst

@@ -0,0 +1,5 @@
+regex-fixes
+-----------
+
+* Regular expressions match the ``^`` anchor at most once in repeated
+  searches, at the start of the input.  See policy :policy:`CMP0186`.

+ 5 - 0
Source/cmGeneratorExpressionNode.cxx

@@ -1706,6 +1706,11 @@ static const struct ListNode : public cmGeneratorExpressionNode
                     return std::string{};
                   }
 
+                  if (!selector) {
+                    selector = cmList::TransformSelector::New();
+                  }
+                  selector->Makefile = ctx->LG->GetMakefile();
+
                   return list
                     .transform(descriptor->Action, arguments,
                                std::move(selector))

+ 7 - 2
Source/cmList.cxx

@@ -523,8 +523,8 @@ public:
                   std::string const& replace) override
   {
     TransformAction::Initialize(selector);
-    this->ReplaceHelper =
-      cm::make_unique<cmStringReplaceHelper>(regex, replace);
+    this->ReplaceHelper = cm::make_unique<cmStringReplaceHelper>(
+      regex, replace, selector->Makefile);
 
     if (!this->ReplaceHelper->IsRegularExpressionValid()) {
       throw transform_error(
@@ -643,6 +643,11 @@ ActionDescriptorSet::iterator TransformConfigure(
 }
 }
 
+std::unique_ptr<cmList::TransformSelector> cmList::TransformSelector::New()
+{
+  return cm::make_unique<TransformNoSelector>();
+}
+
 std::unique_ptr<cmList::TransformSelector> cmList::TransformSelector::NewAT(
   std::initializer_list<index_type> indexes)
 {

+ 4 - 0
Source/cmList.h

@@ -23,6 +23,7 @@
 
 template <typename T>
 class BT;
+class cmMakefile;
 
 /**
  * CMake lists management
@@ -893,6 +894,7 @@ public:
     // cmList::TransformSelector::New<AT>({1, 2, 5, 6});
     //  or
     // cmList::TransformSelector::New<REGEX>("^XX.*");
+    static std::unique_ptr<TransformSelector> New();
     template <typename Type>
     static std::unique_ptr<TransformSelector> New(
       std::initializer_list<index_type>);
@@ -907,6 +909,8 @@ public:
     template <typename Type>
     static std::unique_ptr<TransformSelector> New(std::string&&);
 
+    cmMakefile* Makefile = nullptr;
+
   private:
     static std::unique_ptr<TransformSelector> NewAT(
       std::initializer_list<index_type> init);

+ 5 - 0
Source/cmListCommand.cxx

@@ -678,6 +678,11 @@ bool HandleTransformCommand(std::vector<std::string> const& args,
       return true;
     }
 
+    if (!selector) {
+      selector = cmList::TransformSelector::New();
+    }
+    selector->Makefile = &status.GetMakefile();
+
     list->transform(descriptor->Action, arguments, std::move(selector));
     status.GetMakefile().AddDefinition(outputName, list->to_string());
     return true;

+ 4 - 1
Source/cmPolicies.h

@@ -555,7 +555,10 @@ class cmMakefile;
          WARN)                                                                \
   SELECT(POLICY, CMP0185,                                                     \
          "FindRuby no longer provides upper-case RUBY_* variables.", 4, 0, 0, \
-         WARN)
+         WARN)                                                                \
+  SELECT(POLICY, CMP0186,                                                     \
+         "Regular expressions match ^ at most once in repeated searches.", 4, \
+         1, 0, WARN)
 
 #define CM_SELECT_ID(F, A1, A2, A3, A4, A5, A6) F(A1)
 #define CM_FOR_EACH_POLICY_ID(POLICY)                                         \

+ 11 - 4
Source/cmStringCommand.cxx

@@ -29,6 +29,7 @@
 #include "cmGeneratorExpression.h"
 #include "cmMakefile.h"
 #include "cmMessageType.h"
+#include "cmPolicies.h"
 #include "cmRange.h"
 #include "cmStringAlgorithms.h"
 #include "cmStringReplaceHelper.h"
@@ -288,10 +289,16 @@ bool RegexMatchAll(std::vector<std::string> const& args,
   // Concatenate all the last arguments together.
   std::string input = cmJoin(cmMakeRange(args).advance(4), std::string());
 
+  unsigned optAnchor = 0;
+  if (status.GetMakefile().GetPolicyStatus(cmPolicies::CMP0186) !=
+      cmPolicies::NEW) {
+    optAnchor = cmsys::RegularExpression::BOL_AT_OFFSET;
+  }
+
   // Scan through the input for all matches.
   std::string output;
-  char const* p = input.c_str();
-  while (re.find(p)) {
+  std::string::size_type base = 0;
+  while (re.find(input, base, optAnchor)) {
     status.GetMakefile().ClearMatches();
     status.GetMakefile().StoreMatches(re);
     std::string::size_type l = re.start();
@@ -305,8 +312,8 @@ bool RegexMatchAll(std::vector<std::string> const& args,
     if (!output.empty()) {
       output += ";";
     }
-    output += std::string(p + l, r - l);
-    p += r;
+    output += re.match();
+    base = r;
   }
 
   // Store the output in the provided variable.

+ 14 - 9
Source/cmStringReplaceHelper.cxx

@@ -7,6 +7,7 @@
 #include <utility>
 
 #include "cmMakefile.h"
+#include "cmPolicies.h"
 
 cmStringReplaceHelper::cmStringReplaceHelper(std::string const& regex,
                                              std::string replace_expr,
@@ -24,9 +25,16 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
 {
   output.clear();
 
+  unsigned optAnchor = 0;
+  if (this->Makefile &&
+      this->Makefile->GetPolicyStatus(cmPolicies::CMP0186) !=
+        cmPolicies::NEW) {
+    optAnchor = cmsys::RegularExpression::BOL_AT_OFFSET;
+  }
+
   // Scan through the input for all matches.
   std::string::size_type base = 0;
-  while (this->RegularExpression.find(input.c_str() + base)) {
+  while (this->RegularExpression.find(input, base, optAnchor)) {
     if (this->Makefile) {
       this->Makefile->ClearMatches();
       this->Makefile->StoreMatches(this->RegularExpression);
@@ -35,7 +43,7 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
     auto r = this->RegularExpression.end();
 
     // Concatenate the part of the input that was not matched.
-    output += input.substr(base, l2);
+    output += input.substr(base, l2 - base);
 
     // Make sure the match had some text.
     if (r - l2 == 0) {
@@ -54,11 +62,8 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
         // Replace with part of the match.
         auto n = replacement.Number;
         auto start = this->RegularExpression.start(n);
-        auto end = this->RegularExpression.end(n);
-        auto len = input.length() - base;
-        if ((start != std::string::npos) && (end != std::string::npos) &&
-            (start <= len) && (end <= len)) {
-          output += input.substr(base + start, end - start);
+        if (start != std::string::npos) {
+          output += this->RegularExpression.match(n);
         } else {
           std::ostringstream error;
           error << "replace expression \"" << this->ReplaceExpression
@@ -71,11 +76,11 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
     }
 
     // Move past the match.
-    base += r;
+    base = r;
   }
 
   // Concatenate the text after the last match.
-  output += input.substr(base, input.length() - base);
+  output += input.substr(base);
 
   return true;
 }

+ 1 - 1
Tests/CMakeLib/testList.cxx

@@ -740,7 +740,7 @@ bool testTransform()
     cmList list({ "ABC", "BBCB", "BCCCBC", "BCBCDD", "EBCBCEBC" });
 
     list.transform(cmList::TransformAction::REPLACE, "^BC|BC$", "X");
-    if (list.to_string() != "AX;BBCB;XCCX;XXDD;EBCBCEX") {
+    if (list.to_string() != "AX;BBCB;XCCX;XBCDD;EBCBCEX") {
       result = false;
     }
   }

+ 1 - 1
Tests/RunCMake/CPack/ArchiveCommon/common_helpers.cmake

@@ -60,7 +60,7 @@ function(toExpectedContentList FILE_NO CONTENT_VAR)
 
   unset(filtered_)
   foreach(part_ IN LISTS prepared_)
-    string(REGEX REPLACE "^/" "" part_ "${part_}")
+    string(REGEX REPLACE "^/+" "" part_ "${part_}")
 
     if(part_)
       list(APPEND filtered_ "${prefix_}${part_}")

+ 11 - 0
Tests/RunCMake/GenEx-LIST/CMP0186-NEW-check.cmake

@@ -0,0 +1,11 @@
+set(expected "
+  000;1001;002
+  x000;1001;x002
+  x000;x01;x002
+")
+
+file(READ "${RunCMake_TEST_BINARY_DIR}/generated.txt" generated)
+
+if(NOT generated STREQUAL expected)
+  set(RunCMake_TEST_FAILED "generated:${generated}\nexpected:${expected}")
+endif()

+ 5 - 0
Tests/RunCMake/GenEx-LIST/CMP0186-NEW.cmake

@@ -0,0 +1,5 @@
+file(GENERATE OUTPUT "generated.txt" CONTENT "
+  $<LIST:TRANSFORM,0000;1001;0002,REPLACE,^0,>
+  $<LIST:TRANSFORM,0000;1001;0002,REPLACE,^(a|0),x>
+  $<LIST:TRANSFORM,0000;1001;0002,REPLACE,(1|^)0,x>
+")

+ 11 - 0
Tests/RunCMake/GenEx-LIST/CMP0186-OLD-check.cmake

@@ -0,0 +1,11 @@
+set(expected "
+  ;1001;2
+  xxxx;1001;xxx2
+  xxxx;xx1;xxx2
+")
+
+file(READ "${RunCMake_TEST_BINARY_DIR}/generated.txt" generated)
+
+if(NOT generated STREQUAL expected)
+  set(RunCMake_TEST_FAILED "generated:${generated}\nexpected:${expected}")
+endif()

+ 1 - 0
Tests/RunCMake/GenEx-LIST/CMP0186-OLD.cmake

@@ -0,0 +1 @@
+include(CMP0186-NEW.cmake)

+ 3 - 0
Tests/RunCMake/GenEx-LIST/RunCMakeTest.cmake

@@ -128,3 +128,6 @@ check_list_execution (TRANSFORM-PREPEND)
 check_list_execution (TRANSFORM-REPLACE)
 check_list_execution (REVERSE)
 check_list_execution (SORT)
+
+run_cmake_with_options(CMP0186-OLD -DCMAKE_POLICY_DEFAULT_CMP0186=OLD)
+run_cmake_with_options(CMP0186-NEW -DCMAKE_POLICY_DEFAULT_CMP0186=NEW)

+ 43 - 0
Tests/RunCMake/list/CMP0186.cmake

@@ -0,0 +1,43 @@
+set(mylist 0000 1001 0002)
+
+# OLD
+cmake_policy(SET CMP0186 OLD)
+
+unset(output)
+list(TRANSFORM mylist REPLACE "^0" "" OUTPUT_VARIABLE output)
+if (NOT output STREQUAL ";1001;2")
+  message(FATAL_ERROR "TRANSFORM(REPLACE) is \"${output}\", expected is \";1001;2\"")
+endif()
+
+unset(output)
+list(TRANSFORM mylist REPLACE "^(a|0)" "x" OUTPUT_VARIABLE output)
+if (NOT output STREQUAL "xxxx;1001;xxx2")
+  message(FATAL_ERROR "TRANSFORM(REPLACE) is \"${output}\", expected is \"xxxx;1001;xxx2\"")
+endif()
+
+unset(output)
+list(TRANSFORM mylist REPLACE "(1|^)0" "x" OUTPUT_VARIABLE output)
+if (NOT output STREQUAL "xxxx;xx1;xxx2")
+  message(FATAL_ERROR "TRANSFORM(REPLACE) is \"${output}\", expected is \"xxxx;xx1;xxx2\"")
+endif()
+
+# NEW, same cases as above
+cmake_policy(SET CMP0186 NEW)
+
+unset(output)
+list(TRANSFORM mylist REPLACE "^0" "" OUTPUT_VARIABLE output)
+if (NOT output STREQUAL "000;1001;002")
+  message(FATAL_ERROR "TRANSFORM(REPLACE) is \"${output}\", expected is \"000;1001;002\"")
+endif()
+
+unset(output)
+list(TRANSFORM mylist REPLACE "^(a|0)" "x" OUTPUT_VARIABLE output)
+if (NOT output STREQUAL "x000;1001;x002")
+  message(FATAL_ERROR "TRANSFORM(REPLACE) is \"${output}\", expected is \"x000;1001;x002\"")
+endif()
+
+unset(output)
+list(TRANSFORM mylist REPLACE "(1|^)0" "x" OUTPUT_VARIABLE output)
+if (NOT output STREQUAL "x000;x01;x002") # NEW: "1001" -> "x01"; "^" cannot match again after "10"
+  message(FATAL_ERROR "TRANSFORM(REPLACE) is \"${output}\", expected is \"x000;x01;x002\"")
+endif()

+ 1 - 0
Tests/RunCMake/list/RunCMakeTest.cmake

@@ -90,6 +90,7 @@ run_cmake(TRANSFORM-GENEX_STRIP)
 run_cmake(TRANSFORM-APPEND)
 run_cmake(TRANSFORM-PREPEND)
 run_cmake(TRANSFORM-REPLACE)
+run_cmake(CMP0186)
 
 # argument tests
 run_cmake(SORT-WrongOption)

+ 90 - 0
Tests/RunCMake/string/CMP0186.cmake

@@ -0,0 +1,90 @@
+function(check_output name expected)
+  set(output "${${name}}")
+  if(NOT output STREQUAL expected)
+    message(FATAL_ERROR "\"string(REGEX)\" set ${name} to \"${output}\", expected \"${expected}\"")
+  endif()
+endfunction()
+
+# OLD
+cmake_policy(SET CMP0186 OLD)
+
+string(REGEX MATCHALL "^0" out "0000")
+check_output(out "0;0;0;0")
+
+string(REGEX MATCHALL "^0+" out "0000")
+check_output(out "0000")
+
+string(REGEX MATCHALL "^(0|a)" out "0000" )
+check_output(out "0;0;0;0")
+
+string(REGEX MATCHALL "^(0|a)" out "aaaa")
+check_output(out "a;a;a;a")
+
+string(REGEX MATCHALL "^(0|a)" out "a0a0")
+check_output(out "a;0;a;0")
+
+string(REGEX MATCHALL "(^|a)0" out "00a0")
+check_output(out "0;0;a0")
+
+string(REGEX REPLACE "^0" "" out "0000")
+check_output(out "")
+
+string(REGEX REPLACE "^0" "x" out "0000")
+check_output(out "xxxx")
+
+string(REGEX REPLACE "^0+" "x" out "0000")
+check_output(out "x")
+
+string(REGEX REPLACE "^(0|a)" "x" out "0000")
+check_output(out "xxxx")
+
+string(REGEX REPLACE "^(0|a)" "x" out "aaaa")
+check_output(out "xxxx")
+
+string(REGEX REPLACE "^(0|a)" "x" out "a0a0")
+check_output(out "xxxx")
+
+string(REGEX REPLACE "(^|a)0" "x" out "00a0")
+check_output(out "xxx")
+
+# NEW, same cases as above
+cmake_policy(SET CMP0186 NEW)
+
+string(REGEX MATCHALL "^0" out "0000")
+check_output(out "0")
+
+string(REGEX MATCHALL "^0+" out "0000")
+check_output(out "0000")
+
+string(REGEX MATCHALL "^(0|a)" out "0000")
+check_output(out "0")
+
+string(REGEX MATCHALL "^(0|a)" out "aaaa")
+check_output(out "a")
+
+string(REGEX MATCHALL "^(0|a)" out "a0a0")
+check_output(out "a")
+
+string(REGEX MATCHALL "(^|a)0" out "00a0")
+check_output(out "0;a0")
+
+string(REGEX REPLACE "^0" "" out "0000")
+check_output(out "000")
+
+string(REGEX REPLACE "^0" "x" out "0000")
+check_output(out "x000")
+
+string(REGEX REPLACE "^0+" "x" out "0000")
+check_output(out "x")
+
+string(REGEX REPLACE "^(0|a)" "x" out "0000")
+check_output(out "x000")
+
+string(REGEX REPLACE "^(0|a)" "x" out "aaaa")
+check_output(out "xaaa")
+
+string(REGEX REPLACE "^(0|a)" "x" out "a0a0")
+check_output(out "x0a0")
+
+string(REGEX REPLACE "(^|a)0" "x" out "00a0")
+check_output(out "x0x")

+ 10 - 10
Tests/RunCMake/string/RegexMultiMatchClear-stderr.txt

@@ -1,12 +1,12 @@
-^matches: Some::;Scope
+^matches: Some::
 results from: string\(REGEX MATCHALL\)
-CMAKE_MATCH_0: -->Scope<--
-CMAKE_MATCH_1: -->Scope<--
-CMAKE_MATCH_2: --><--
-CMAKE_MATCH_COUNT: -->1<--
-replace: \[Some\]\[Scope\]
+CMAKE_MATCH_0: -->Some::<--
+CMAKE_MATCH_1: -->Some<--
+CMAKE_MATCH_2: -->::<--
+CMAKE_MATCH_COUNT: -->2<--
+replace: \[Some\]Scope
 results from: string\(REGEX REPLACE\)
-CMAKE_MATCH_0: -->Scope<--
-CMAKE_MATCH_1: -->Scope<--
-CMAKE_MATCH_2: --><--
-CMAKE_MATCH_COUNT: -->1<--$
+CMAKE_MATCH_0: -->Some::<--
+CMAKE_MATCH_1: -->Some<--
+CMAKE_MATCH_2: -->::<--
+CMAKE_MATCH_COUNT: -->2<--$

+ 2 - 0
Tests/RunCMake/string/RegexMultiMatchClear.cmake

@@ -1,3 +1,5 @@
+cmake_policy(SET CMP0186 NEW)
+
 function (output_results msg)
   message("results from: ${msg}")
   message("CMAKE_MATCH_0: -->${CMAKE_MATCH_0}<--")

+ 1 - 0
Tests/RunCMake/string/RunCMakeTest.cmake

@@ -35,6 +35,7 @@ run_cmake(UuidBadType)
 
 run_cmake(RegexClear)
 run_cmake(RegexMultiMatchClear)
+run_cmake(CMP0186)
 
 run_cmake(UTF-16BE)
 run_cmake(UTF-16LE)