Browse Source

StringAlgorithms: Refactor `cmTokenize()` function

- Refactor and optimize the loop to make it shorter and faster
- Make it push elements into an arbitrary (templated) output iterator
- Make it a template on a separator type with the most used defaults
- Add a backward compatible signature to return `std::vector<std::string>`
- Add an alternative function `cmTokenizedView()` to return a vector of string views
Alex Turbov 1 year ago
parent
commit
f3f70c2f90

+ 2 - 3
Source/CPack/WiX/cmWIXAccessControlList.cxx

@@ -50,14 +50,13 @@ void cmWIXAccessControlList::CreatePermissionElement(std::string const& entry)
     user = user_and_domain;
   }
 
-  std::vector<std::string> permissions = cmTokenize(permission_string, ",");
-
   this->SourceWriter.BeginElement("Permission");
   this->SourceWriter.AddAttribute("User", std::string(user));
   if (!domain.empty()) {
     this->SourceWriter.AddAttribute("Domain", std::string(domain));
   }
-  for (std::string const& permission : permissions) {
+  for (auto permission :
+       cmTokenizedView(permission_string, ',', cmTokenizerMode::New)) {
     this->EmitBooleanAttribute(entry, cmTrimWhitespace(permission));
   }
   this->SourceWriter.EndElement("Permission");

+ 1 - 1
Source/CPack/cmCPackInnoSetupGenerator.cxx

@@ -980,7 +980,7 @@ bool cmCPackInnoSetupGenerator::BuildDownloadedComponentArchive(
                   "Problem running certutil command: " << hashCmd
                                                        << std::endl);
   }
-  *hash = cmTrimWhitespace(cmTokenize(hashOutput, "\n").at(1));
+  *hash = cmTrimWhitespace(cmTokenizedView(hashOutput, '\n').at(1));
 
   if (hash->length() != 64) {
     cmCPackLogger(cmCPackLog::LOG_WARNING,

+ 9 - 7
Source/cmComputeLinkDepends.cxx

@@ -257,7 +257,7 @@ const LinkLibraryFeatureAttributeSet& GetLinkLibraryFeatureAttributes(
         if (processingOption.match(1) == "LIBRARY_TYPE") {
           featureAttributes.LibraryTypes.clear();
           for (auto const& value :
-               cmTokenize(processingOption.match(2), ","_s)) {
+               cmTokenize(processingOption.match(2), ',')) {
             if (value == "STATIC") {
               featureAttributes.LibraryTypes.emplace(
                 cmStateEnums::STATIC_LIBRARY);
@@ -292,7 +292,8 @@ const LinkLibraryFeatureAttributeSet& GetLinkLibraryFeatureAttributes(
           }
         } else if (processingOption.match(1) == "OVERRIDE") {
           featureAttributes.Override.clear();
-          auto values = cmTokenize(processingOption.match(2), ","_s);
+          std::vector<std::string> values =
+            cmTokenize(processingOption.match(2), ',');
           featureAttributes.Override.insert(values.begin(), values.end());
         }
       } else {
@@ -668,13 +669,14 @@ cmComputeLinkDepends::cmComputeLinkDepends(const cmGeneratorTarget* target,
       *linkLibraryOverride, target->GetLocalGenerator(), config, target, &dag,
       target, linkLanguage);
 
-    auto overrideList = cmTokenize(overrideValue, ","_s);
+    std::vector<std::string> overrideList =
+      cmTokenize(overrideValue, ',', cmTokenizerMode::New);
     if (overrideList.size() >= 2) {
       auto const& feature = overrideList.front();
-      for_each(overrideList.cbegin() + 1, overrideList.cend(),
-               [this, &feature](std::string const& item) {
-                 this->LinkLibraryOverride.emplace(item, feature);
-               });
+      std::for_each(overrideList.cbegin() + 1, overrideList.cend(),
+                    [this, &feature](std::string const& item) {
+                      this->LinkLibraryOverride.emplace(item, feature);
+                    });
     }
   }
 }

+ 3 - 3
Source/cmGeneratorTarget_Options.cxx

@@ -565,11 +565,11 @@ std::vector<BT<std::string>>& cmGeneratorTarget::ResolveLinkerWrapper(
       cmSystemTools::ParseUnixCommandLine(
         value.c_str() + LINKER_SHELL.length(), linkerOptions);
     } else {
-      linkerOptions = cmTokenize(value.substr(LINKER.length()), ",");
+      linkerOptions =
+        cmTokenize(value.substr(LINKER.length()), ',', cmTokenizerMode::New);
     }
 
-    if (linkerOptions.empty() ||
-        (linkerOptions.size() == 1 && linkerOptions.front().empty())) {
+    if (linkerOptions.empty()) {
       continue;
     }
 

+ 4 - 3
Source/cmGlobalVisualStudio10Generator.cxx

@@ -339,12 +339,13 @@ bool cmGlobalVisualStudio10Generator::SetGeneratorToolset(
 bool cmGlobalVisualStudio10Generator::ParseGeneratorToolset(
   std::string const& ts, cmMakefile* mf)
 {
-  std::vector<std::string> const fields = cmTokenize(ts, ",");
-  auto fi = fields.begin();
-  if (fi == fields.end()) {
+  std::vector<std::string> const fields =
+    cmTokenize(ts, ',', cmTokenizerMode::New);
+  if (fields.empty()) {
     return true;
   }
 
+  auto fi = fields.begin();
   // The first field may be the VS platform toolset.
   if (fi->find('=') == fi->npos) {
     this->GeneratorToolset = *fi;

+ 4 - 3
Source/cmGlobalVisualStudio8Generator.cxx

@@ -128,12 +128,13 @@ bool cmGlobalVisualStudio8Generator::ParseGeneratorPlatform(
 {
   this->GeneratorPlatform.clear();
 
-  std::vector<std::string> const fields = cmTokenize(p, ",");
-  auto fi = fields.begin();
-  if (fi == fields.end()) {
+  std::vector<std::string> const fields =
+    cmTokenize(p, ',', cmTokenizerMode::New);
+  if (fields.empty()) {
     return true;
   }
 
+  auto fi = fields.begin();
   // The first field may be the VS platform.
   if (fi->find('=') == fi->npos) {
     this->GeneratorPlatform = *fi;

+ 4 - 3
Source/cmGlobalVisualStudioVersionedGenerator.cxx

@@ -584,12 +584,13 @@ bool cmGlobalVisualStudioVersionedGenerator::ParseGeneratorInstance(
   this->GeneratorInstance.clear();
   this->GeneratorInstanceVersion.clear();
 
-  std::vector<std::string> const fields = cmTokenize(is, ",");
-  auto fi = fields.begin();
-  if (fi == fields.end()) {
+  std::vector<std::string> const fields =
+    cmTokenize(is, ',', cmTokenizerMode::New);
+  if (fields.empty()) {
     return true;
   }
 
+  auto fi = fields.begin();
   // The first field may be the VS instance.
   if (fi->find('=') == fi->npos) {
     this->GeneratorInstance = *fi;

+ 6 - 5
Source/cmGlobalXCodeGenerator.cxx

@@ -355,12 +355,13 @@ bool cmGlobalXCodeGenerator::SetGeneratorToolset(std::string const& ts,
 bool cmGlobalXCodeGenerator::ParseGeneratorToolset(std::string const& ts,
                                                    cmMakefile* mf)
 {
-  std::vector<std::string> const fields = cmTokenize(ts, ",");
-  auto fi = fields.cbegin();
-  if (fi == fields.cend()) {
+  std::vector<std::string> const fields =
+    cmTokenize(ts, ',', cmTokenizerMode::New);
+  if (fields.empty()) {
     return true;
   }
 
+  auto fi = fields.cbegin();
   // The first field may be the Xcode GCC_VERSION.
   if (fi->find('=') == fi->npos) {
     this->GeneratorToolset = *fi;
@@ -4457,7 +4458,7 @@ cmXCodeObject* cmGlobalXCodeGenerator::CreateOrGetPBXGroup(
   if (it != this->TargetGroup.end()) {
     tgroup = it->second;
   } else {
-    std::vector<std::string> tgt_folders = cmTokenize(target, "/");
+    std::vector<std::string> const tgt_folders = cmTokenize(target, '/');
     std::string curr_tgt_folder;
     for (std::vector<std::string>::size_type i = 0; i < tgt_folders.size();
          i++) {
@@ -4490,7 +4491,7 @@ cmXCodeObject* cmGlobalXCodeGenerator::CreateOrGetPBXGroup(
   // It's a recursive folder structure, let's find the real parent group
   if (sg->GetFullName() != sg->GetName()) {
     std::string curr_folder = cmStrCat(target, '/');
-    for (auto const& folder : cmTokenize(sg->GetFullName(), "\\")) {
+    for (auto const& folder : cmTokenize(sg->GetFullName(), '\\')) {
       curr_folder += folder;
       auto const i_folder = this->GroupNameMap.find(curr_folder);
       // Create new folder

+ 3 - 7
Source/cmMakefile.cxx

@@ -2387,13 +2387,9 @@ cmSourceGroup* cmMakefile::GetOrCreateSourceGroup(
 
 cmSourceGroup* cmMakefile::GetOrCreateSourceGroup(const std::string& name)
 {
-  std::string delimiters;
-  if (cmValue p = this->GetDefinition("SOURCE_GROUP_DELIMITER")) {
-    delimiters = *p;
-  } else {
-    delimiters = "/\\";
-  }
-  return this->GetOrCreateSourceGroup(cmTokenize(name, delimiters));
+  auto p = this->GetDefinition("SOURCE_GROUP_DELIMITER");
+  return this->GetOrCreateSourceGroup(
+    cmTokenize(name, p ? cm::string_view(*p) : R"(\/)"_s));
 }
 
 /**

+ 16 - 24
Source/cmSourceGroupCommand.cxx

@@ -29,11 +29,6 @@ const std::string kFilesOptionName = "FILES";
 const std::string kRegexOptionName = "REGULAR_EXPRESSION";
 const std::string kSourceGroupOptionName = "<sg_name>";
 
-std::vector<std::string> tokenizePath(const std::string& path)
-{
-  return cmTokenize(path, "\\/");
-}
-
 std::set<std::string> getSourceGroupFilesPaths(
   const std::string& root, const std::vector<std::string>& files)
 {
@@ -95,31 +90,28 @@ bool addFilesToItsSourceGroups(const std::string& root,
   cmSourceGroup* sg;
 
   for (std::string const& sgFilesPath : sgFilesPaths) {
+    std::vector<std::string> tokenizedPath = cmTokenize(
+      prefix.empty() ? sgFilesPath : cmStrCat(prefix, '/', sgFilesPath),
+      R"(\/)", cmTokenizerMode::New);
 
-    std::vector<std::string> tokenizedPath;
-    if (!prefix.empty()) {
-      tokenizedPath = tokenizePath(cmStrCat(prefix, '/', sgFilesPath));
-    } else {
-      tokenizedPath = tokenizePath(sgFilesPath);
+    if (tokenizedPath.empty()) {
+      continue;
     }
+    tokenizedPath.pop_back();
 
-    if (!tokenizedPath.empty()) {
-      tokenizedPath.pop_back();
-
-      if (tokenizedPath.empty()) {
-        tokenizedPath.emplace_back();
-      }
+    if (tokenizedPath.empty()) {
+      tokenizedPath.emplace_back();
+    }
 
-      sg = makefile.GetOrCreateSourceGroup(tokenizedPath);
+    sg = makefile.GetOrCreateSourceGroup(tokenizedPath);
 
-      if (!sg) {
-        errorMsg = "Could not create source group for file: " + sgFilesPath;
-        return false;
-      }
-      const std::string fullPath =
-        cmSystemTools::CollapseFullPath(sgFilesPath, root);
-      sg->AddGroupFile(fullPath);
+    if (!sg) {
+      errorMsg = "Could not create source group for file: " + sgFilesPath;
+      return false;
     }
+    const std::string fullPath =
+      cmSystemTools::CollapseFullPath(sgFilesPath, root);
+    sg->AddGroupFile(fullPath);
   }
 
   return true;

+ 0 - 24
Source/cmStringAlgorithms.cxx

@@ -55,30 +55,6 @@ std::string cmEscapeQuotes(cm::string_view str)
   return result;
 }
 
-std::vector<std::string> cmTokenize(cm::string_view str, cm::string_view sep)
-{
-  std::vector<std::string> tokens;
-  cm::string_view::size_type tokend = 0;
-
-  do {
-    cm::string_view::size_type tokstart = str.find_first_not_of(sep, tokend);
-    if (tokstart == cm::string_view::npos) {
-      break; // no more tokens
-    }
-    tokend = str.find_first_of(sep, tokstart);
-    if (tokend == cm::string_view::npos) {
-      tokens.emplace_back(str.substr(tokstart));
-    } else {
-      tokens.emplace_back(str.substr(tokstart, tokend - tokstart));
-    }
-  } while (tokend != cm::string_view::npos);
-
-  if (tokens.empty()) {
-    tokens.emplace_back();
-  }
-  return tokens;
-}
-
 namespace {
 template <std::size_t N, typename T>
 inline void MakeDigits(cm::string_view& view, char (&digits)[N],

+ 82 - 9
Source/cmStringAlgorithms.h

@@ -24,7 +24,7 @@ using cmStringRange = cmRange<std::vector<std::string>::const_iterator>;
 
 /** Returns length of a literal string.  */
 template <size_t N>
-constexpr size_t cmStrLen(const char (&/*str*/)[N])
+constexpr size_t cmStrLen(const char (&)[N])
 {
   return N - 1;
 }
@@ -91,12 +91,12 @@ std::string cmJoinStrings(Range const& rng, cm::string_view separator,
   }
 
   std::string result;
-  result.reserve(
-    std::accumulate(std::begin(rng), std::end(rng),
-                    initial.size() + (rng.size() - 1) * separator.size(),
-                    [](std::size_t sum, const std::string& item) {
-                      return sum + item.size();
-                    }));
+  result.reserve(std::accumulate(
+    std::begin(rng), std::end(rng),
+    initial.size() + (rng.size() - 1) * separator.size(),
+    [](std::size_t sum, typename Range::value_type const& item) {
+      return sum + item.size();
+    }));
   result.append(std::begin(initial), std::end(initial));
 
   auto begin = std::begin(rng);
@@ -122,8 +122,81 @@ std::string cmJoin(std::vector<std::string> const& rng,
 std::string cmJoin(cmStringRange const& rng, cm::string_view separator,
                    cm::string_view initial = {});
 
-/** Extract tokens that are separated by any of the characters in @a sep.  */
-std::vector<std::string> cmTokenize(cm::string_view str, cm::string_view sep);
+enum class cmTokenizerMode
+{
+  /// A backward-compatible behavior when in the case of no
+  /// tokens have found in an input text it'll return one empty
+  /// token in the result container (vector).
+  Legacy,
+  /// The new behavior is to return an empty vector.
+  New
+};
+
+/**
+ * \brief A generic version of a tokenizer.
+ *
+ * Extract tokens from the input string separated by any
+ * of the characters in `sep` and assign them to the
+ * given output iterator.
+ *
+ * The `mode` parameter defines the behavior in the case when
+ * no tokens have found in the input text.
+ *
+ */
+template <typename StringT, typename OutIt, typename Sep = char>
+void cmTokenize(OutIt outIt, cm::string_view str, Sep sep,
+                cmTokenizerMode mode)
+{
+  auto hasTokens = false;
+  // clang-format off
+  for (auto start = str.find_first_not_of(sep)
+    , end = str.find_first_of(sep, start)
+    ; start != cm::string_view::npos
+    ; start = str.find_first_not_of(sep, end)
+    , end = str.find_first_of(sep, start)
+    , hasTokens = true
+    ) {
+    *outIt++ = StringT{ str.substr(start, end - start) };
+  }
+  // clang-format on
+  if (!hasTokens && mode == cmTokenizerMode::Legacy) {
+    *outIt = {};
+  }
+}
+
+/**
+ * \brief Extract tokens that are separated by any of the
+ * characters in `sep`.
+ *
+ * Backward compatible signature.
+ *
+ * \return A vector of strings.
+ */
+template <typename Sep = char>
+std::vector<std::string> cmTokenize(
+  cm::string_view str, Sep sep, cmTokenizerMode mode = cmTokenizerMode::Legacy)
+{
+  using StringType = std::string;
+  std::vector<StringType> tokens;
+  cmTokenize<StringType>(std::back_inserter(tokens), str, sep, mode);
+  return tokens;
+}
+
+/**
+ * \brief Extract tokens that are separated by any of the
+ * characters in `sep`.
+ *
+ * \return A vector of string views.
+ */
+template <typename Sep = char>
+std::vector<cm::string_view> cmTokenizedView(
+  cm::string_view str, Sep sep, cmTokenizerMode mode = cmTokenizerMode::Legacy)
+{
+  using StringType = cm::string_view;
+  std::vector<StringType> tokens;
+  cmTokenize<StringType>(std::back_inserter(tokens), str, sep, mode);
+  return tokens;
+}
 
 /** Concatenate string pieces into a single string.  */
 std::string cmCatViews(

+ 2 - 2
Source/cmake.cxx

@@ -1131,7 +1131,7 @@ void cmake::SetArgs(const std::vector<std::string>& args)
       "--debug-find-pkg", "Provide a package argument for --debug-find-pkg",
       CommandArgument::Values::One, CommandArgument::RequiresSeparator::Yes,
       [](std::string const& value, cmake* state) -> bool {
-        std::vector<std::string> find_pkgs(cmTokenize(value, ","));
+        std::vector<std::string> find_pkgs(cmTokenize(value, ','));
         std::cout << "Running with debug output on for the 'find' commands "
                      "for package(s)";
         for (auto const& v : find_pkgs) {
@@ -1145,7 +1145,7 @@ void cmake::SetArgs(const std::vector<std::string>& args)
       "--debug-find-var", CommandArgument::Values::One,
       CommandArgument::RequiresSeparator::Yes,
       [](std::string const& value, cmake* state) -> bool {
-        std::vector<std::string> find_vars(cmTokenize(value, ","));
+        std::vector<std::string> find_vars(cmTokenize(value, ','));
         std::cout << "Running with debug output on for the variable(s)";
         for (auto const& v : find_vars) {
           std::cout << ' ' << v;

+ 20 - 1
Tests/CMakeLib/testStringAlgorithms.cxx

@@ -10,6 +10,7 @@
 #include <vector>
 
 #include <cm/string_view>
+#include <cmext/string_view>
 
 #include "cmStringAlgorithms.h"
 
@@ -103,7 +104,9 @@ int testStringAlgorithms(int /*unused*/, char* /*unused*/[])
   {
     typedef std::vector<std::string> VT;
     assert_ok(cmTokenize("", ";") == VT{ "" }, "cmTokenize empty");
-    assert_ok(cmTokenize(";", ";") == VT{ "" }, "cmTokenize sep");
+    assert_ok(cmTokenize(";", ";") == VT{ "" }, "cmTokenize sep (char*)");
+    assert_ok(cmTokenize(";", ";"_s) == VT{ "" },
+              "cmTokenize sep (string_view)");
     assert_ok(cmTokenize("abc", ";") == VT{ "abc" }, "cmTokenize item");
     assert_ok(cmTokenize("abc;", ";") == VT{ "abc" }, "cmTokenize item sep");
     assert_ok(cmTokenize(";abc", ";") == VT{ "abc" }, "cmTokenize sep item");
@@ -112,6 +115,22 @@ int testStringAlgorithms(int /*unused*/, char* /*unused*/[])
     assert_ok(cmTokenize("a1;a2;a3;a4", ";") == VT{ "a1", "a2", "a3", "a4" },
               "cmTokenize multiple items");
   }
+  {
+    typedef std::vector<cm::string_view> VT;
+    assert_ok(cmTokenizedView("", ';') == VT{ "" }, "cmTokenizedView empty");
+    assert_ok(cmTokenizedView(";", ';') == VT{ "" }, "cmTokenizedView sep");
+    assert_ok(cmTokenizedView("abc", ';') == VT{ "abc" },
+              "cmTokenizedView item");
+    assert_ok(cmTokenizedView("abc;", ';') == VT{ "abc" },
+              "cmTokenizedView item sep");
+    assert_ok(cmTokenizedView(";abc", ';') == VT{ "abc" },
+              "cmTokenizedView sep item");
+    assert_ok(cmTokenizedView("abc;;efg", ';') == VT{ "abc", "efg" },
+              "cmTokenizedView item sep sep item");
+    assert_ok(cmTokenizedView("a1;a2;a3;a4", ';') ==
+                VT{ "a1", "a2", "a3", "a4" },
+              "cmTokenizedView multiple items");
+  }
 
   // ----------------------------------------------------------------------
   // Test cmStrCat