Browse Source

string: Allow zero-length matches in all REGEX subcommands

The semantics mimic those of other languages such as Python, Java, and
JavaScript. To advance past a zero-length match, the search algorithm
first tries to find a non-empty match at the same position (e.g. through
an alternative branch). If that fails, it force-advances by 1 character.

Fixes: #13790, #13792, #18690, #26629
Nikita Nemkin 10 months ago
parent
commit
8d455809b0

+ 3 - 0
Help/command/string.rst

@@ -122,6 +122,9 @@ Search and Replace With Regular Expressions
   string instead of the beginning of each repeated search.
   See policy :policy:`CMP0186`.
 
+  Zero-length matches are allowed in ``MATCH``, ``MATCHALL`` and ``REPLACE``.
+  Previously, they triggered an error.
+
   The replacement expression may contain references to subexpressions that
   didn't match anything. Previously, such references triggered an error.
 

+ 2 - 0
Help/release/dev/regex-fixes.rst

@@ -6,3 +6,5 @@ regex-fixes
 
 * References to unmatched groups are allowed, they are replaced with empty
   strings.
+
+* Zero-length matches are always allowed.

+ 14 - 20
Source/cmStringCommand.cxx

@@ -251,15 +251,7 @@ bool RegexMatch(std::vector<std::string> const& args,
   std::string output;
   if (re.find(input)) {
     status.GetMakefile().StoreMatches(re);
-    std::string::size_type l = re.start();
-    std::string::size_type r = re.end();
-    if (r - l == 0) {
-      std::string e = "sub-command REGEX, mode MATCH regex \"" + regex +
-        "\" matched an empty string.";
-      status.SetError(e);
-      return false;
-    }
-    output = input.substr(l, r - l);
+    output = re.match();
   }
 
   // Store the output in the provided variable.
@@ -298,22 +290,24 @@ bool RegexMatchAll(std::vector<std::string> const& args,
   // Scan through the input for all matches.
   std::string output;
   std::string::size_type base = 0;
-  while (re.find(input, base, optAnchor)) {
+  unsigned optNonEmpty = 0;
+  while (re.find(input, base, optAnchor | optNonEmpty)) {
     status.GetMakefile().ClearMatches();
     status.GetMakefile().StoreMatches(re);
-    std::string::size_type l = re.start();
-    std::string::size_type r = re.end();
-    if (r - l == 0) {
-      std::string e = "sub-command REGEX, mode MATCHALL regex \"" + regex +
-        "\" matched an empty string.";
-      status.SetError(e);
-      return false;
-    }
-    if (!output.empty()) {
+    if (!output.empty() || optNonEmpty) {
       output += ";";
     }
     output += re.match();
-    base = r;
+    base = re.end();
+
+    if (re.start() == input.length()) {
+      break;
+    }
+    if (re.start() == re.end()) {
+      optNonEmpty = cmsys::RegularExpression::NONEMPTY_AT_OFFSET;
+    } else {
+      optNonEmpty = 0;
+    }
   }
 
   // Store the output in the provided variable.

+ 17 - 16
Source/cmStringReplaceHelper.cxx

@@ -33,25 +33,17 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
   }
 
   // Scan through the input for all matches.
+  auto& re = this->RegularExpression;
   std::string::size_type base = 0;
-  while (this->RegularExpression.find(input, base, optAnchor)) {
+  unsigned optNonEmpty = 0;
+  while (re.find(input, base, optAnchor | optNonEmpty)) {
     if (this->Makefile) {
       this->Makefile->ClearMatches();
-      this->Makefile->StoreMatches(this->RegularExpression);
+      this->Makefile->StoreMatches(re);
     }
-    auto l2 = this->RegularExpression.start();
-    auto r = this->RegularExpression.end();
 
     // Concatenate the part of the input that was not matched.
-    output += input.substr(base, l2 - base);
-
-    // Make sure the match had some text.
-    if (r - l2 == 0) {
-      std::ostringstream error;
-      error << "regex \"" << this->RegExString << "\" matched an empty string";
-      this->ErrorString = error.str();
-      return false;
-    }
+    output += input.substr(base, re.start() - base);
 
     // Concatenate the replacement for the match.
     for (auto const& replacement : this->Replacements) {
@@ -61,7 +53,7 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
       } else {
         // Replace with part of the match.
         auto n = replacement.Number;
-        if (n > this->RegularExpression.num_groups()) {
+        if (n > re.num_groups()) {
           std::ostringstream error;
           error << "replace expression \"" << this->ReplaceExpression
                 << "\" contains an out-of-range escape for regex \""
@@ -69,12 +61,21 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
           this->ErrorString = error.str();
           return false;
         }
-        output += this->RegularExpression.match(n);
+        output += re.match(n);
       }
     }
 
     // Move past the match.
-    base = r;
+    base = re.end();
+
+    if (re.start() == input.length()) {
+      break;
+    }
+    if (re.start() == re.end()) {
+      optNonEmpty = cmsys::RegularExpression::NONEMPTY_AT_OFFSET;
+    } else {
+      optNonEmpty = 0;
+    }
   }
 
   // Concatenate the text after the last match.

+ 1 - 1
Tests/CMakeTests/StringTest.cmake.in

@@ -84,7 +84,7 @@ check_cmake_test(String
 # Execute each test listed in StringTestScript.cmake:
 #
 set(scriptname "@CMAKE_CURRENT_SOURCE_DIR@/StringTestScript.cmake")
-set(number_of_tests_expected 73)
+set(number_of_tests_expected 70)
 
 include("@CMAKE_CURRENT_SOURCE_DIR@/ExecuteScriptTests.cmake")
 execute_all_script_tests(${scriptname} number_of_tests_executed)

+ 0 - 9
Tests/CMakeTests/StringTestScript.cmake

@@ -73,9 +73,6 @@ elseif(testname STREQUAL regex_match_multiple_inputs) # pass
 elseif(testname STREQUAL regex_match_bad_regex) # fail
   string(REGEX MATCH "(.*" v input)
 
-elseif(testname STREQUAL regex_match_empty_string) # fail
-  string(REGEX MATCH "x*" v "")
-
 elseif(testname STREQUAL regex_match_no_match) # pass
   string(REGEX MATCH "xyz" v "abc")
   message(STATUS "v='${v}'")
@@ -87,9 +84,6 @@ elseif(testname STREQUAL regex_matchall_multiple_inputs) # pass
 elseif(testname STREQUAL regex_matchall_bad_regex) # fail
   string(REGEX MATCHALL "(.*" v input)
 
-elseif(testname STREQUAL regex_matchall_empty_string) # fail
-  string(REGEX MATCHALL "x*" v "")
-
 elseif(testname STREQUAL regex_replace_ends_with_backslash) # fail
   string(REGEX REPLACE "input" "output\\" v input1 input2 input3 input4)
 
@@ -107,9 +101,6 @@ elseif(testname STREQUAL regex_replace_has_bogus_escape) # fail
 elseif(testname STREQUAL regex_replace_bad_regex) # fail
   string(REGEX REPLACE "this (.*" "with that" v input)
 
-elseif(testname STREQUAL regex_replace_empty_string) # fail
-  string(REGEX REPLACE "x*" "that" v "")
-
 elseif(testname STREQUAL regex_replace_index_too_small) # fail
   string(REGEX REPLACE "^this (.*)$" "with \\1 \\-1" v "this input")
 

+ 143 - 0
Tests/RunCMake/string/RegexEmptyMatch.cmake

@@ -0,0 +1,143 @@
+cmake_policy(SET CMP0186 NEW)
+
+function(check_output name expected)
+  set(output "${${name}}")
+  if(NOT output STREQUAL expected)
+    message(FATAL_ERROR "\"string(REGEX)\" set ${name} to \"${output}\", expected \"${expected}\"")
+  endif()
+endfunction()
+
+# Zero-length matches in REGEX MATCH
+
+string(REGEX MATCH "" out "")
+check_output(out "")
+
+string(REGEX MATCH "" out "a")
+check_output(out "")
+
+string(REGEX MATCH "a*" out "")
+check_output(out "")
+
+string(REGEX MATCH "a*" out "a")
+check_output(out "a")
+
+string(REGEX MATCH "a*" out "b")
+check_output(out "")
+
+string(REGEX MATCH "a*" out "ba")
+check_output(out "")
+
+# Zero-length matches in REGEX MATCHALL
+
+string(REGEX MATCHALL "" out "")
+check_output(out "")
+
+string(REGEX MATCHALL "" out "ab")
+check_output(out ";;")
+
+string(REGEX MATCHALL "^" out "ab")
+check_output(out "")
+
+string(REGEX MATCHALL "(^|,)" out "a,b")
+check_output(out ";,")
+
+string(REGEX MATCHALL "(,|^)" out "a,b")
+check_output(out ";,")
+
+string(REGEX MATCHALL "(^|)" out "")
+check_output(out "")
+
+string(REGEX MATCHALL "(^|)" out "ab")
+check_output(out ";;")
+
+string(REGEX MATCHALL "a|^" out "ab")
+check_output(out "a")
+
+string(REGEX MATCHALL "$" out "ab")
+check_output(out "")
+
+string(REGEX MATCHALL "($|,)" out "a,b")
+check_output(out ",;")
+
+string(REGEX MATCHALL "(,|$)" out "a,b")
+check_output(out ",;")
+
+string(REGEX MATCHALL "(|$)" out "")
+check_output(out "")
+
+string(REGEX MATCHALL "(|$)" out "ab")
+check_output(out ";;")
+
+string(REGEX MATCHALL "(b|)" out "abc")
+check_output(out ";b;;")
+
+string(REGEX MATCHALL "(|b)" out "abc")
+check_output(out ";;b;;")
+
+string(REGEX MATCHALL "a*" out "aaa")
+check_output(out "aaa;")
+
+string(REGEX MATCHALL "(a)?(b)?" out "")
+check_output(out "")
+
+string(REGEX MATCHALL "(a)?(b)?" out "abba")
+check_output(out "ab;b;a;")
+
+# Zero-length matches in REGEX REPLACE
+
+string(REGEX REPLACE "" "" out "")
+check_output(out "")
+
+string(REGEX REPLACE "" "x" out "")
+check_output(out "x")
+
+string(REGEX REPLACE "" "x" out "ab")
+check_output(out "xaxbx")
+
+string(REGEX REPLACE "^" "x" out "ab")
+check_output(out "xab")
+
+string(REGEX REPLACE "(^|,)" "x" out "a,b")
+check_output(out "xaxb")
+
+string(REGEX REPLACE "(,|^)" "x" out "a,b")
+check_output(out "xaxb")
+
+string(REGEX REPLACE "(^|)" "x" out "")
+check_output(out "x")
+
+string(REGEX REPLACE "(^|)" "x" out "ab")
+check_output(out "xaxbx")
+
+string(REGEX REPLACE "a|^" "x" out "ab")
+check_output(out "xb")
+
+string(REGEX REPLACE "$" "x" out "ab")
+check_output(out "abx")
+
+string(REGEX REPLACE "($|,)" "x" out "a,b")
+check_output(out "axbx")
+
+string(REGEX REPLACE "(,|$)" "x" out "a,b")
+check_output(out "axbx")
+
+string(REGEX REPLACE "(|$)" "x" out "")
+check_output(out "x")
+
+string(REGEX REPLACE "(|$)" "x" out "ab")
+check_output(out "xaxbx")
+
+string(REGEX REPLACE "(b|)" "x" out "abc")
+check_output(out "xaxxcx")
+
+string(REGEX REPLACE "(|b)" "x" out "abc")
+check_output(out "xaxxxcx")
+
+string(REGEX REPLACE "a*" "x" out "aaa")
+check_output(out "xx")
+
+string(REGEX REPLACE "(a)?(b)?" "x" out "")
+check_output(out "x")
+
+string(REGEX REPLACE "(a)?(b)?" "x" out "abba")
+check_output(out "xxxx")

+ 1 - 0
Tests/RunCMake/string/RunCMakeTest.cmake

@@ -35,6 +35,7 @@ run_cmake(UuidBadType)
 
 run_cmake(RegexClear)
 run_cmake(RegexMultiMatchClear)
+run_cmake(RegexEmptyMatch)
 run_cmake(CMP0186)
 
 run_cmake(UTF-16BE)