Browse Source

CUDA: Clang separable compilation

For NVCC the compiler takes care of device linking when passed the "-dlink"
flag.
Clang doesn't support such magic and requires the buildsystem to do the work
that NVCC does behind the scenes.

The implementation is based on the device linking steps in TensorFlow's Bazel build definitions for NCCL:
https://github.com/tensorflow/tensorflow/blob/7cabcdf073abad8c46e9dda62bb8fa4682d2061e/third_party/nccl/build_defs.bzl.tpl#L259

Closes: #20726
Raul Tambre 5 years ago
parent
commit
c63fe01835

+ 4 - 0
Help/release/dev/cuda-clang-separable-compilation.rst

@@ -0,0 +1,4 @@
+cuda-clang-separable-compilation
+--------------------------------
+
+* :prop_tgt:`CUDA_SEPARABLE_COMPILATION` is now supported when using Clang.

+ 3 - 0
Modules/CMakeCUDACompiler.cmake.in

@@ -3,6 +3,8 @@ set(CMAKE_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@")
 set(CMAKE_CUDA_HOST_LINK_LAUNCHER "@CMAKE_CUDA_HOST_LINK_LAUNCHER@")
 set(CMAKE_CUDA_COMPILER_ID "@CMAKE_CUDA_COMPILER_ID@")
 set(CMAKE_CUDA_COMPILER_VERSION "@CMAKE_CUDA_COMPILER_VERSION@")
+set(CMAKE_CUDA_DEVICE_LINKER "@CMAKE_CUDA_DEVICE_LINKER@")
+set(CMAKE_CUDA_FATBINARY "@CMAKE_CUDA_FATBINARY@")
 set(CMAKE_CUDA_STANDARD_COMPUTED_DEFAULT "@CMAKE_CUDA_STANDARD_COMPUTED_DEFAULT@")
 set(CMAKE_CUDA_COMPILE_FEATURES "@CMAKE_CUDA_COMPILE_FEATURES@")
 set(CMAKE_CUDA03_COMPILE_FEATURES "@CMAKE_CUDA03_COMPILE_FEATURES@")
@@ -44,6 +46,7 @@ if(CMAKE_CUDA_LIBRARY_ARCHITECTURE)
 endif()
 
 set(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT "@CMAKE_CUDA_COMPILER_TOOLKIT_ROOT@")
+set(CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT "@CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT@")
 set(CMAKE_CUDA_COMPILER_LIBRARY_ROOT "@CMAKE_CUDA_COMPILER_LIBRARY_ROOT@")
 
 set(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@")

+ 6 - 1
Modules/CMakeCUDAInformation.cmake

@@ -145,7 +145,7 @@ endif()
 #Specify how to compile when separable compilation has been requested
 if(NOT CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION)
   set(CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION
-    "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -dc <SOURCE> -o <OBJECT>")
+    "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} ${_CMAKE_CUDA_DEVICE_CODE} <SOURCE> -o <OBJECT>")
 endif()
 
 #Specify how to compile when whole compilation has been requested
@@ -200,6 +200,11 @@ if(NOT CMAKE_CUDA_DEVICE_LINK_EXECUTABLE)
     "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> ${CMAKE_CUDA_COMPILE_OPTIONS_PIC} ${_CMAKE_CUDA_EXTRA_DEVICE_LINK_FLAGS} -shared -dlink <OBJECTS> -o <TARGET> <LINK_LIBRARIES>${__IMPLICT_DLINK_FLAGS}")
 endif()
 
+# Used when device linking is handled by CMake.
+# The rule compiles the toolkit's bin/crt/link.stub (looked up under
+# CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT) into <OBJECT>. The register file
+# (<REGISTER_FILE>) and the fatbinary (<FATBINARY>) are handed to the stub as
+# quoted preprocessor definitions. A value that is already set is left untouched.
+if(NOT CMAKE_CUDA_DEVICE_LINK_COMPILE)
+  set(CMAKE_CUDA_DEVICE_LINK_COMPILE "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <FLAGS> -D__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ -D__NV_EXTRA_INITIALIZATION=\"\" -D__NV_EXTRA_FINALIZATION=\"\" -DREGISTERLINKBINARYFILE=\\\"<REGISTER_FILE>\\\" -DFATBINFILE=\\\"<FATBINARY>\\\" ${_CMAKE_COMPILE_AS_CUDA_FLAG} -c \"${CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT}/bin/crt/link.stub\" -o <OBJECT>")
+endif()
+
 unset(__IMPLICT_DLINK_FLAGS)
 
 set(CMAKE_CUDA_INFORMATION_LOADED 1)

+ 14 - 2
Modules/CMakeDetermineCUDACompiler.cmake

@@ -169,11 +169,14 @@ if(NOT CMAKE_CUDA_COMPILER_ID_RUN)
     endif()
 
     get_filename_component(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT "${_CUDA_NVCC_EXECUTABLE}" DIRECTORY)
+    set(CMAKE_CUDA_DEVICE_LINKER "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/nvlink${CMAKE_EXECUTABLE_SUFFIX}")
+    set(CMAKE_CUDA_FATBINARY "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/fatbinary${CMAKE_EXECUTABLE_SUFFIX}")
     get_filename_component(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}" DIRECTORY)
 
-    # CMAKE_CUDA_COMPILER_LIBRARY_ROOT contains the device library and version file.
-    # In a non-scattered installation this is equivalent to CMAKE_CUDA_COMPILER_TOOLKIT_ROOT.
+    # In a non-scattered installation the following are equivalent to CMAKE_CUDA_COMPILER_TOOLKIT_ROOT.
     # We first check for a non-scattered installation to prefer it over a scattered installation.
+
+    # CMAKE_CUDA_COMPILER_LIBRARY_ROOT contains the device library and version file.
     if(EXISTS "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/version.txt")
       set(CMAKE_CUDA_COMPILER_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}")
     elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/version.txt")
@@ -181,6 +184,15 @@ if(NOT CMAKE_CUDA_COMPILER_ID_RUN)
     elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/cuda/version.txt")
       set(CMAKE_CUDA_COMPILER_LIBRARY_ROOT "${CMAKE_SYSROOT}/usr/lib/cuda")
     endif()
+
+    # CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT contains the linking stubs necessary for device linking and other low-level library files.
+    # Like CMAKE_CUDA_COMPILER_LIBRARY_ROOT, prefer a non-scattered installation next to the compiler so that the
+    # stubs come from the same toolkit as the compiler, even if a scattered distribution package is also installed.
+    if(EXISTS "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/bin/crt/link.stub")
+      set(CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}")
+    elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/nvidia-cuda-toolkit/bin/crt/link.stub")
+      set(CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT "${CMAKE_SYSROOT_LINK}/usr/lib/nvidia-cuda-toolkit")
+    elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/nvidia-cuda-toolkit/bin/crt/link.stub")
+      set(CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT "${CMAKE_SYSROOT}/usr/lib/nvidia-cuda-toolkit")
+    else()
+      # No stub was found anywhere; keep the toolkit root as the default.
+      set(CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}")
+    endif()
   endif()
 
   set(CMAKE_CUDA_COMPILER_ID_FLAGS_ALWAYS "-v")

+ 1 - 0
Modules/Compiler/Clang-CUDA.cmake

@@ -13,6 +13,7 @@ __compiler_clang_cxx_standards(CUDA)
 set(CMAKE_CUDA_COMPILER_HAS_DEVICE_LINK_PHASE TRUE)
 set(_CMAKE_COMPILE_AS_CUDA_FLAG "-x cuda")
 set(_CMAKE_CUDA_PTX_FLAG "--cuda-device-only -S")
+set(_CMAKE_CUDA_DEVICE_CODE "-fgpu-rdc -c")
 
 # RulePlaceholderExpander expands crosscompile variables like sysroot and target only for CMAKE_<LANG>_COMPILER. Override the default.
 set(CMAKE_CUDA_LINK_EXECUTABLE "<CMAKE_CUDA_COMPILER> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>${__IMPLICT_LINKS}")

+ 1 - 0
Modules/Compiler/NVIDIA-CUDA.cmake

@@ -6,6 +6,7 @@ set(CMAKE_CUDA_VERBOSE_COMPILE_FLAG "-Xcompiler=-v")
 
 set(_CMAKE_COMPILE_AS_CUDA_FLAG "-x cu")
 set(_CMAKE_CUDA_PTX_FLAG "-ptx")
+set(_CMAKE_CUDA_DEVICE_CODE "-dc")
 
 if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2.89)
   # The -forward-unknown-to-host-compiler flag was only

+ 0 - 11
Source/cmLocalGenerator.cxx

@@ -1955,17 +1955,6 @@ void cmLocalGenerator::AddLanguageFlags(std::string& flags,
   } else if (lang == "CUDA") {
     target->AddCUDAArchitectureFlags(flags);
     target->AddCUDAToolkitFlags(flags);
-
-    if (compiler == "Clang") {
-      bool separable = target->GetPropertyAsBool("CUDA_SEPARABLE_COMPILATION");
-
-      if (separable) {
-        this->Makefile->IssueMessage(
-          MessageType::FATAL_ERROR,
-          "CUDA_SEPARABLE_COMPILATION isn't supported on Clang. "
-          "See CMake issue #20726.");
-      }
-    }
   } else if (lang == "ISPC") {
     target->AddISPCTargetFlags(flags);
   }

+ 1 - 1
Source/cmLocalGenerator.h

@@ -446,7 +446,7 @@ public:
   void GetTargetCompileFlags(cmGeneratorTarget* target,
                              std::string const& config,
                              std::string const& lang, std::string& flags,
-                             std::string const& arch = std::string());
+                             std::string const& arch);
   std::vector<BT<std::string>> GetTargetCompileFlags(
     cmGeneratorTarget* target, std::string const& config,
     std::string const& lang, std::string const& arch = std::string());

+ 31 - 22
Source/cmMakefileExecutableTargetGenerator.cxx

@@ -91,19 +91,12 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule(
 
   std::vector<std::string> commands;
 
-  // Get the language to use for linking this library.
-  std::string linkLanguage = "CUDA";
+  // Get the name of the device object to generate.
   std::string const& objExt =
     this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION");
-
-  // Build list of dependencies.
-  std::vector<std::string> depends;
-  this->AppendLinkDepends(depends, linkLanguage);
-
-  // Get the name of the device object to generate.
-  std::string const targetOutputReal =
+  std::string const targetOutput =
     this->GeneratorTarget->ObjectDirectory + "cmake_device_link" + objExt;
-  this->DeviceLinkObject = targetOutputReal;
+  this->DeviceLinkObject = targetOutput;
 
   this->NumberOfProgressActions++;
   if (!this->NoRuleMessages) {
@@ -111,7 +104,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule(
     this->MakeEchoProgress(progress);
     // Add the link message.
     std::string buildEcho =
-      cmStrCat("Linking ", linkLanguage, " device code ",
+      cmStrCat("Linking CUDA device code ",
                this->LocalGenerator->ConvertToOutputFormat(
                  this->LocalGenerator->MaybeConvertToRelativePath(
                    this->LocalGenerator->GetCurrentBinaryDirectory(),
@@ -121,6 +114,29 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule(
       commands, buildEcho, cmLocalUnixMakefileGenerator3::EchoLink, &progress);
   }
 
+  if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") {
+    this->WriteDeviceLinkRule(commands, targetOutput);
+  } else {
+    this->WriteNvidiaDeviceExecutableRule(relink, commands, targetOutput);
+  }
+
+  // Write the main driver rule to build everything in this target.
+  this->WriteTargetDriverRule(targetOutput, relink);
+#else
+  static_cast<void>(relink);
+#endif
+}
+
+void cmMakefileExecutableTargetGenerator::WriteNvidiaDeviceExecutableRule(
+  bool relink, std::vector<std::string>& commands,
+  const std::string& targetOutput)
+{
+  const std::string linkLanguage = "CUDA";
+
+  // Build list of dependencies.
+  std::vector<std::string> depends;
+  this->AppendLinkDepends(depends, linkLanguage);
+
   // Build a list of compiler flags and linker flags.
   std::string langFlags;
   std::string linkFlags;
@@ -136,7 +152,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule(
   // may need to be cleaned.
   std::vector<std::string> exeCleanFiles;
   exeCleanFiles.push_back(this->LocalGenerator->MaybeConvertToRelativePath(
-    this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal));
+    this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput));
 
   // Determine whether a link script will be used.
   bool useLinkScript = this->GlobalGenerator->GetUseLinkScript();
@@ -195,7 +211,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule(
       : cmOutputConverter::SHELL;
     std::string target = this->LocalGenerator->ConvertToOutputFormat(
       this->LocalGenerator->MaybeConvertToRelativePath(
-        this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal),
+        this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput),
       output);
 
     std::string targetFullPathCompilePDB =
@@ -226,7 +242,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule(
       this->LocalGenerator->CreateRulePlaceholderExpander());
 
     // Expand placeholders in the commands.
-    rulePlaceholderExpander->SetTargetImpLib(targetOutputReal);
+    rulePlaceholderExpander->SetTargetImpLib(targetOutput);
     for (std::string& real_link_command : real_link_commands) {
       real_link_command = cmStrCat(launcher, real_link_command);
       rulePlaceholderExpander->ExpandRuleVariables(this->LocalGenerator,
@@ -255,17 +271,10 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule(
 
   // Write the build rule.
   this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr,
-                                      targetOutputReal, depends, commands,
-                                      false);
-
-  // Write the main driver rule to build everything in this target.
-  this->WriteTargetDriverRule(targetOutputReal, relink);
+                                      targetOutput, depends, commands, false);
 
   // Clean all the possible executable names and symlinks.
   this->CleanFiles.insert(exeCleanFiles.begin(), exeCleanFiles.end());
-#else
-  static_cast<void>(relink);
-#endif
 }
 
 void cmMakefileExecutableTargetGenerator::WriteExecutableRule(bool relink)

+ 4 - 0
Source/cmMakefileExecutableTargetGenerator.h

@@ -5,6 +5,7 @@
 #include "cmConfigure.h" // IWYU pragma: keep
 
 #include <string>
+#include <vector>
 
 #include "cmMakefileTargetGenerator.h"
 
@@ -23,6 +24,9 @@ public:
 protected:
   virtual void WriteExecutableRule(bool relink);
   virtual void WriteDeviceExecutableRule(bool relink);
+  virtual void WriteNvidiaDeviceExecutableRule(
+    bool relink, std::vector<std::string>& commands,
+    const std::string& targetOutput);
 
 private:
   std::string DeviceLinkObject;

+ 41 - 32
Source/cmMakefileLibraryTargetGenerator.cxx

@@ -129,8 +129,7 @@ void cmMakefileLibraryTargetGenerator::WriteStaticLibraryRules()
   const bool requiresDeviceLinking = requireDeviceLinking(
     *this->GeneratorTarget, *this->LocalGenerator, this->GetConfigName());
   if (requiresDeviceLinking) {
-    std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY";
-    this->WriteDeviceLibraryRules(linkRuleVar, false);
+    this->WriteDeviceLibraryRules("CMAKE_CUDA_DEVICE_LINK_LIBRARY", false);
   }
 
   std::string linkLanguage =
@@ -156,8 +155,7 @@ void cmMakefileLibraryTargetGenerator::WriteSharedLibraryRules(bool relink)
     const bool requiresDeviceLinking = requireDeviceLinking(
       *this->GeneratorTarget, *this->LocalGenerator, this->GetConfigName());
     if (requiresDeviceLinking) {
-      std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY";
-      this->WriteDeviceLibraryRules(linkRuleVar, relink);
+      this->WriteDeviceLibraryRules("CMAKE_CUDA_DEVICE_LINK_LIBRARY", relink);
     }
   }
 
@@ -191,8 +189,7 @@ void cmMakefileLibraryTargetGenerator::WriteModuleLibraryRules(bool relink)
     const bool requiresDeviceLinking = requireDeviceLinking(
       *this->GeneratorTarget, *this->LocalGenerator, this->GetConfigName());
     if (requiresDeviceLinking) {
-      std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY";
-      this->WriteDeviceLibraryRules(linkRuleVar, relink);
+      this->WriteDeviceLibraryRules("CMAKE_CUDA_DEVICE_LINK_LIBRARY", relink);
     }
   }
 
@@ -239,29 +236,13 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules(
   // TODO: Merge the methods that call this method to avoid
   // code duplication.
   std::vector<std::string> commands;
-
-  // Get the language to use for linking this library.
-  std::string linkLanguage = "CUDA";
   std::string const objExt =
     this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION");
 
-  // Build list of dependencies.
-  std::vector<std::string> depends;
-  this->AppendLinkDepends(depends, linkLanguage);
-
-  // Add language-specific flags.
-  std::string langFlags;
-  this->LocalGenerator->AddLanguageFlagsForLinking(
-    langFlags, this->GeneratorTarget, linkLanguage, this->GetConfigName());
-
-  // Create set of linking flags.
-  std::string linkFlags;
-  this->GetDeviceLinkFlags(linkFlags, linkLanguage);
-
   // Get the name of the device object to generate.
-  std::string const targetOutputReal =
+  std::string const targetOutput =
     this->GeneratorTarget->ObjectDirectory + "cmake_device_link" + objExt;
-  this->DeviceLinkObject = targetOutputReal;
+  this->DeviceLinkObject = targetOutput;
 
   this->NumberOfProgressActions++;
   if (!this->NoRuleMessages) {
@@ -269,7 +250,7 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules(
     this->MakeEchoProgress(progress);
     // Add the link message.
     std::string buildEcho =
-      cmStrCat("Linking ", linkLanguage, " device code ",
+      cmStrCat("Linking CUDA device code ",
                this->LocalGenerator->ConvertToOutputFormat(
                  this->LocalGenerator->MaybeConvertToRelativePath(
                    this->LocalGenerator->GetCurrentBinaryDirectory(),
@@ -278,10 +259,41 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules(
     this->LocalGenerator->AppendEcho(
       commands, buildEcho, cmLocalUnixMakefileGenerator3::EchoLink, &progress);
   }
+
+  if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") {
+    this->WriteDeviceLinkRule(commands, targetOutput);
+  } else {
+    this->WriteNvidiaDeviceLibraryRules(linkRuleVar, relink, commands,
+                                        targetOutput);
+  }
+
+  // Write the main driver rule to build everything in this target.
+  this->WriteTargetDriverRule(targetOutput, relink);
+}
+
+void cmMakefileLibraryTargetGenerator::WriteNvidiaDeviceLibraryRules(
+  const std::string& linkRuleVar, bool relink,
+  std::vector<std::string>& commands, const std::string& targetOutput)
+{
+  std::string linkLanguage = "CUDA";
+
+  // Build list of dependencies.
+  std::vector<std::string> depends;
+  this->AppendLinkDepends(depends, linkLanguage);
+
+  // Add language-specific flags.
+  std::string langFlags;
+  this->LocalGenerator->AddLanguageFlagsForLinking(
+    langFlags, this->GeneratorTarget, linkLanguage, this->GetConfigName());
+
+  // Create set of linking flags.
+  std::string linkFlags;
+  this->GetDeviceLinkFlags(linkFlags, linkLanguage);
+
   // Clean files associated with this library.
   std::set<std::string> libCleanFiles;
   libCleanFiles.insert(this->LocalGenerator->MaybeConvertToRelativePath(
-    this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal));
+    this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput));
 
   // Determine whether a link script will be used.
   bool useLinkScript = this->GlobalGenerator->GetUseLinkScript();
@@ -335,7 +347,7 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules(
 
     std::string target = this->LocalGenerator->ConvertToOutputFormat(
       this->LocalGenerator->MaybeConvertToRelativePath(
-        this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal),
+        this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput),
       output);
 
     std::string targetFullPathCompilePDB =
@@ -364,7 +376,7 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules(
       this->LocalGenerator->CreateRulePlaceholderExpander());
 
     // Construct the main link rule and expand placeholders.
-    rulePlaceholderExpander->SetTargetImpLib(targetOutputReal);
+    rulePlaceholderExpander->SetTargetImpLib(targetOutput);
     std::string linkRule = this->GetLinkRule(linkRuleVar);
     cmExpandList(linkRule, real_link_commands);
 
@@ -399,14 +411,11 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules(
   commands1.clear();
 
   // Compute the list of outputs.
-  std::vector<std::string> outputs(1, targetOutputReal);
+  std::vector<std::string> outputs(1, targetOutput);
 
   // Write the build rule.
   this->WriteMakeRule(*this->BuildFileStream, nullptr, outputs, depends,
                       commands, false);
-
-  // Write the main driver rule to build everything in this target.
-  this->WriteTargetDriverRule(targetOutputReal, relink);
 #else
   static_cast<void>(linkRuleVar);
   static_cast<void>(relink);

+ 5 - 0
Source/cmMakefileLibraryTargetGenerator.h

@@ -5,6 +5,7 @@
 #include "cmConfigure.h" // IWYU pragma: keep
 
 #include <string>
+#include <vector>
 
 #include "cmMakefileTargetGenerator.h"
 
@@ -27,6 +28,10 @@ protected:
   void WriteModuleLibraryRules(bool relink);
 
   void WriteDeviceLibraryRules(const std::string& linkRule, bool relink);
+  void WriteNvidiaDeviceLibraryRules(const std::string& linkRuleVar,
+                                     bool relink,
+                                     std::vector<std::string>& commands,
+                                     const std::string& targetOutput);
   void WriteLibraryRules(const std::string& linkRule,
                          const std::string& extraFlags, bool relink);
   // MacOSX Framework support methods

+ 131 - 2
Source/cmMakefileTargetGenerator.cxx

@@ -2,10 +2,13 @@
    file Copyright.txt or https://cmake.org/licensing for details.  */
 #include "cmMakefileTargetGenerator.h"
 
+#include <algorithm>
 #include <cassert>
 #include <cstdio>
+#include <iterator>
 #include <sstream>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 
 #include <cm/memory>
@@ -25,6 +28,7 @@
 #include "cmMakefileExecutableTargetGenerator.h"
 #include "cmMakefileLibraryTargetGenerator.h"
 #include "cmMakefileUtilityTargetGenerator.h"
+#include "cmMessageType.h"
 #include "cmOutputConverter.h"
 #include "cmPolicies.h"
 #include "cmProperty.h"
@@ -1323,6 +1327,130 @@ void cmMakefileTargetGenerator::WriteObjectDependRules(
   }
 }
 
+// Write the Makefile rules that perform CUDA device linking without help from
+// the compiler driver:
+//   1. one CMAKE_CUDA_DEVICE_LINKER rule per architecture producing
+//      sm_<arch>.cubin in the target's object directory,
+//   2. one CMAKE_CUDA_FATBINARY rule embedding all cubins into
+//      cmake_cuda_fatbin.h,
+//   3. one rule compiling the link stub (CMAKE_CUDA_DEVICE_LINK_COMPILE) into
+//      the device link object.
+//
+// commands: commands already queued for the final rule; the stub compile
+//           command is appended to it.
+// output:   path of the device link object to generate.
+//
+// Issues a fatal error and writes nothing when CUDA_ARCHITECTURES is unset or
+// off.
+void cmMakefileTargetGenerator::WriteDeviceLinkRule(
+  std::vector<std::string>& commands, const std::string& output)
+{
+  std::string architecturesStr =
+    this->GeneratorTarget->GetSafeProperty("CUDA_ARCHITECTURES");
+
+  if (cmIsOff(architecturesStr)) {
+    this->Makefile->IssueMessage(MessageType::FATAL_ERROR,
+                                 "CUDA_SEPARABLE_COMPILATION on Clang "
+                                 "requires CUDA_ARCHITECTURES to be set.");
+    return;
+  }
+
+  std::vector<std::string> architectures = cmExpandedList(architecturesStr);
+
+  // Collect the device link inputs and drop duplicates. The first occurrence
+  // of every entry is kept in place so that the generated rules and the
+  // device linker command line are stable between runs.
+  const std::vector<std::string> linkDeps = [&]() -> std::vector<std::string> {
+    std::vector<std::string> deps;
+    this->AppendTargetDepends(deps, true);
+    this->GeneratorTarget->GetLinkDepends(deps, this->GetConfigName(), "CUDA");
+    std::copy(this->Objects.begin(), this->Objects.end(),
+              std::back_inserter(deps));
+
+    std::unordered_set<std::string> seen;
+    std::vector<std::string> uniqueDeps;
+    uniqueDeps.reserve(deps.size());
+    for (std::string& dep : deps) {
+      if (seen.insert(dep).second) {
+        uniqueDeps.emplace_back(std::move(dep));
+      }
+    }
+    return uniqueDeps;
+  }();
+
+  const std::string objectDir = this->GeneratorTarget->ObjectDirectory;
+  const std::string relObjectDir =
+    this->LocalGenerator->MaybeConvertToRelativePath(
+      this->LocalGenerator->GetCurrentBinaryDirectory(), objectDir);
+
+  // Construct a list of files associated with this target that
+  // may need to be cleaned.
+  std::vector<std::string> cleanFiles;
+  cleanFiles.push_back(this->LocalGenerator->MaybeConvertToRelativePath(
+    this->LocalGenerator->GetCurrentBinaryDirectory(), output));
+
+  std::string profiles;
+  std::vector<std::string> fatbinaryDepends;
+  std::string registerFile = cmStrCat(objectDir, "cmake_cuda_register.h");
+
+  // Link device code for each architecture.
+  for (const std::string& architectureKind : architectures) {
+    // Clang always generates real code, so strip the specifier.
+    const std::string architecture =
+      architectureKind.substr(0, architectureKind.find('-'));
+    const std::string cubin =
+      cmStrCat(relObjectDir, "sm_", architecture, ".cubin");
+
+    // Entries such as "52-real;52-virtual" name the same architecture once
+    // the specifier is stripped. Write the rule for each cubin only once to
+    // avoid duplicate make rules and duplicate fatbinary profiles.
+    if (std::find(fatbinaryDepends.begin(), fatbinaryDepends.end(), cubin) !=
+        fatbinaryDepends.end()) {
+      continue;
+    }
+
+    profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
+    fatbinaryDepends.emplace_back(cubin);
+    // The cubin is an intermediate file generated by the build.
+    cleanFiles.push_back(cubin);
+
+    std::string registerFileCmd;
+
+    // The generated register file contains macros that when expanded register
+    // the device routines. Because the routines are the same for all
+    // architectures the register file will be the same too. Thus generate it
+    // only on the first invocation to reduce overhead.
+    if (fatbinaryDepends.size() == 1) {
+      std::string registerFileRel =
+        this->LocalGenerator->MaybeConvertToRelativePath(
+          this->LocalGenerator->GetCurrentBinaryDirectory(), registerFile);
+      registerFileCmd =
+        cmStrCat(" --register-link-binaries=", registerFileRel);
+      cleanFiles.push_back(registerFileRel);
+    }
+
+    std::string command = cmStrCat(
+      this->Makefile->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"),
+      " -arch=sm_", architecture, registerFileCmd, " -o=$@ ",
+      cmJoin(linkDeps, " "));
+
+    this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, cubin,
+                                        linkDeps, { command }, false);
+  }
+
+  // Combine all architectures into a single fatbinary.
+  const std::string fatbinaryCommand =
+    cmStrCat(this->Makefile->GetRequiredDefinition("CMAKE_CUDA_FATBINARY"),
+             " -64 -cmdline=--compile-only -compress-all -link "
+             "--embedded-fatbin=$@",
+             profiles);
+  const std::string fatbinaryOutput =
+    cmStrCat(objectDir, "cmake_cuda_fatbin.h");
+  const std::string fatbinaryOutputRel =
+    this->LocalGenerator->MaybeConvertToRelativePath(
+      this->LocalGenerator->GetCurrentBinaryDirectory(), fatbinaryOutput);
+  // The fatbinary header is generated by the build as well.
+  cleanFiles.push_back(fatbinaryOutputRel);
+
+  this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr,
+                                      fatbinaryOutputRel, fatbinaryDepends,
+                                      { fatbinaryCommand }, false);
+
+  // Compile the stub that registers the kernels and contains the fatbinaries.
+  cmRulePlaceholderExpander::RuleVariables vars;
+  vars.CMTargetName = this->GetGeneratorTarget()->GetName().c_str();
+  vars.CMTargetType =
+    cmState::GetTargetTypeName(this->GetGeneratorTarget()->GetType()).c_str();
+
+  vars.Language = "CUDA";
+  vars.Object = output.c_str();
+  vars.Fatbinary = fatbinaryOutput.c_str();
+  vars.RegisterFile = registerFile.c_str();
+
+  std::string flags = this->GetFlags("CUDA", this->GetConfigName());
+  vars.Flags = flags.c_str();
+
+  std::string compileCmd = this->GetLinkRule("CMAKE_CUDA_DEVICE_LINK_COMPILE");
+  std::unique_ptr<cmRulePlaceholderExpander> rulePlaceholderExpander(
+    this->LocalGenerator->CreateRulePlaceholderExpander());
+  rulePlaceholderExpander->ExpandRuleVariables(this->LocalGenerator,
+                                               compileCmd, vars);
+
+  // Depend on exactly the name the fatbinary rule was written under so that
+  // the rule target and this dependency can never diverge.
+  commands.emplace_back(compileCmd);
+  this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, output,
+                                      { fatbinaryOutputRel }, commands, false);
+
+  // Clean all the files generated for device linking.
+  this->CleanFiles.insert(cleanFiles.begin(), cleanFiles.end());
+}
+
 void cmMakefileTargetGenerator::GenerateCustomRuleFile(
   cmCustomCommandGenerator const& ccg)
 {
@@ -1579,10 +1707,11 @@ void cmMakefileTargetGenerator::WriteTargetDriverRule(
 }
 
 void cmMakefileTargetGenerator::AppendTargetDepends(
-  std::vector<std::string>& depends)
+  std::vector<std::string>& depends, bool ignoreType)
 {
   // Static libraries never depend on anything for linking.
-  if (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY) {
+  if (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY &&
+      !ignoreType) {
     return;
   }
 

+ 6 - 1
Source/cmMakefileTargetGenerator.h

@@ -104,6 +104,10 @@ protected:
   void WriteObjectDependRules(cmSourceFile const& source,
                               std::vector<std::string>& depends);
 
+  // CUDA device linking.
+  void WriteDeviceLinkRule(std::vector<std::string>& commands,
+                           const std::string& output);
+
   // write the build rule for a custom command
   void GenerateCustomRuleFile(cmCustomCommandGenerator const& ccg);
 
@@ -127,7 +131,8 @@ protected:
   void DriveCustomCommands(std::vector<std::string>& depends);
 
   // append intertarget dependencies
-  void AppendTargetDepends(std::vector<std::string>& depends);
+  void AppendTargetDepends(std::vector<std::string>& depends,
+                           bool ignoreType = false);
 
   // Append object file dependencies.
   void AppendObjectDepends(std::vector<std::string>& depends);

+ 200 - 24
Source/cmNinjaNormalTargetGenerator.cxx

@@ -8,6 +8,7 @@
 #include <map>
 #include <set>
 #include <sstream>
+#include <unordered_set>
 #include <utility>
 
 #include <cm/memory>
@@ -25,6 +26,7 @@
 #include "cmLocalGenerator.h"
 #include "cmLocalNinjaGenerator.h"
 #include "cmMakefile.h"
+#include "cmMessageType.h"
 #include "cmNinjaLinkLineDeviceComputer.h"
 #include "cmNinjaTypes.h"
 #include "cmOSXBundleGenerator.h"
@@ -178,6 +180,33 @@ std::string cmNinjaNormalTargetGenerator::LanguageLinkerDeviceRule(
     "_", config);
 }
 
+// Build the rule name "<link language>_DEVICE_LINK__<encoded target>_<config>"
+// for the given configuration.
+std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceRule(
+  const std::string& config) const
+{
+  const std::string encodedTarget =
+    cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName());
+  return cmStrCat(this->TargetLinkLanguage(config), "_DEVICE_LINK__",
+                  encodedTarget, '_', config);
+}
+
+// Build the rule name
+// "<link language>_DEVICE_LINK_COMPILE__<encoded target>_<config>" for the
+// given configuration.
+std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceCompileRule(
+  const std::string& config) const
+{
+  const std::string encodedTarget =
+    cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName());
+  return cmStrCat(this->TargetLinkLanguage(config), "_DEVICE_LINK_COMPILE__",
+                  encodedTarget, '_', config);
+}
+
+// Build the rule name "<link language>_FATBINARY__<encoded target>_<config>"
+// for the given configuration.
+std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaFatbinaryRule(
+  const std::string& config) const
+{
+  const std::string encodedTarget =
+    cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName());
+  return cmStrCat(this->TargetLinkLanguage(config), "_FATBINARY__",
+                  encodedTarget, '_', config);
+}
+
 struct cmNinjaRemoveNoOpCommands
 {
   bool operator()(std::string const& cmd)
@@ -186,7 +215,7 @@ struct cmNinjaRemoveNoOpCommands
   }
 };
 
-void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule(
+void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkRule(
   bool useResponseFile, const std::string& config)
 {
   cmNinjaRule rule(this->LanguageLinkerDeviceRule(config));
@@ -272,6 +301,55 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule(
   }
 }
 
+// Register the three Ninja rules used for CUDA device linking in the given
+// configuration: linking device code with CMAKE_CUDA_DEVICE_LINKER, compiling
+// the device stub with CMAKE_CUDA_DEVICE_LINK_COMPILE and creating the
+// fatbinary with CMAKE_CUDA_FATBINARY. Per-statement values are left as the
+// Ninja variables $ARCH, $REGISTER, $FATBIN and $PROFILES.
+void cmNinjaNormalTargetGenerator::WriteDeviceLinkRules(
+  const std::string& config)
+{
+  const cmMakefile* makefile = this->GetMakefile();
+  auto* const localGen = this->GetLocalGenerator();
+  auto* const globalGen = this->GetGlobalGenerator();
+
+  // Rule that links the device code of a single architecture.
+  {
+    cmNinjaRule linkRule(LanguageLinkerCudaDeviceRule(config));
+    linkRule.Command = localGen->BuildCommandLine(
+      { cmStrCat(makefile->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"),
+                 " -arch=$ARCH $REGISTER -o=$out $in") });
+    linkRule.Comment = "Rule for CUDA device linking.";
+    linkRule.Description = "Linking CUDA $out";
+    globalGen->AddRule(linkRule);
+  }
+
+  // Rule that compiles the stub; placeholders are expanded to Ninja variables.
+  {
+    auto* const target = this->GetGeneratorTarget();
+
+    cmRulePlaceholderExpander::RuleVariables stubVars;
+    stubVars.CMTargetName = target->GetName().c_str();
+    stubVars.CMTargetType =
+      cmState::GetTargetTypeName(target->GetType()).c_str();
+    stubVars.Language = "CUDA";
+    stubVars.Object = "$out";
+    stubVars.Fatbinary = "$FATBIN";
+    stubVars.RegisterFile = "$REGISTER";
+
+    // Keep the flags string alive while stubVars points into it.
+    const std::string cudaFlags = this->GetFlags("CUDA", config);
+    stubVars.Flags = cudaFlags.c_str();
+
+    std::string stubCommand =
+      makefile->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINK_COMPILE");
+    std::unique_ptr<cmRulePlaceholderExpander> expander(
+      localGen->CreateRulePlaceholderExpander());
+    expander->ExpandRuleVariables(localGen, stubCommand, stubVars);
+
+    cmNinjaRule stubRule(LanguageLinkerCudaDeviceCompileRule(config));
+    stubRule.Command = localGen->BuildCommandLine({ stubCommand });
+    stubRule.Comment = "Rule for compiling CUDA device stubs.";
+    stubRule.Description = "Compiling CUDA device stub $out";
+    globalGen->AddRule(stubRule);
+  }
+
+  // Rule that bundles the per-architecture images into one fatbinary.
+  {
+    cmNinjaRule fatbinRule(LanguageLinkerCudaFatbinaryRule(config));
+    fatbinRule.Command = localGen->BuildCommandLine(
+      { cmStrCat(makefile->GetRequiredDefinition("CMAKE_CUDA_FATBINARY"),
+                 " -64 -cmdline=--compile-only -compress-all -link "
+                 "--embedded-fatbin=$out $PROFILES") });
+    fatbinRule.Comment = "Rule for CUDA fatbinaries.";
+    fatbinRule.Description = "Creating fatbinary $out";
+    globalGen->AddRule(fatbinRule);
+  }
+}
+
 void cmNinjaNormalTargetGenerator::WriteLinkRule(bool useResponseFile,
                                                  const std::string& config)
 {
@@ -586,7 +664,6 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
 
   // First and very important step is to make sure while inside this
   // step our link language is set to CUDA
-  std::string cudaLinkLanguage = "CUDA";
   std::string const& objExt =
     this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION");
 
@@ -598,6 +675,118 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
   std::string targetOutputReal =
     ConvertToNinjaPath(targetOutputDir + "cmake_device_link" + objExt);
 
+  if (firstForConfig) {
+    globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal);
+  }
+  this->DeviceLinkObject = targetOutputReal;
+
+  // Write comments.
+  cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream());
+  this->GetCommonFileStream()
+    << "# Device Link build statements for "
+    << cmState::GetTargetTypeName(genTarget->GetType()) << " target "
+    << this->GetTargetName() << "\n\n";
+
+  if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") {
+    std::string architecturesStr =
+      this->GeneratorTarget->GetSafeProperty("CUDA_ARCHITECTURES");
+
+    if (cmIsOff(architecturesStr)) {
+      this->Makefile->IssueMessage(MessageType::FATAL_ERROR,
+                                   "CUDA_SEPARABLE_COMPILATION on Clang "
+                                   "requires CUDA_ARCHITECTURES to be set.");
+      return;
+    }
+
+    this->WriteDeviceLinkRules(config);
+    this->WriteDeviceLinkStatements(config, cmExpandedList(architecturesStr),
+                                    targetOutputReal);
+  } else {
+    this->WriteNvidiaDeviceLinkStatement(config, fileConfig, targetOutputDir,
+                                         targetOutputReal);
+  }
+}
+
+// Writes the Ninja build statements that device link CUDA code when the CUDA
+// compiler is Clang, in three stages:
+//   1. one device link statement per architecture, producing sm_<arch>.cubin;
+//   2. one fatbinary statement combining all cubins into cmake_cuda_fatbin.h;
+//   3. one compile statement that builds the device link object from the
+//      fatbinary header and the register header.
+//
+// config        - configuration the statements are written for.
+// architectures - architecture entries; anything from the first '-' onwards
+//                 is dropped and entries naming the same architecture are
+//                 linked only once.
+// output        - Ninja path of the device link object to produce.
+void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements(
+  const std::string& config, const std::vector<std::string>& architectures,
+  const std::string& output)
+{
+  // Gather the device link inputs: the target's objects followed by its link
+  // dependencies. Duplicates are dropped while keeping first-seen order so
+  // the generated build statements are stable; copying straight out of a
+  // hash set would emit the inputs in an unspecified order.
+  const cmNinjaDeps explicitDeps = [&]() -> std::vector<std::string> {
+    std::unordered_set<std::string> seen;
+    std::vector<std::string> deps;
+    const auto appendUnique = [&seen, &deps](const cmNinjaDeps& paths) {
+      for (const std::string& path : paths) {
+        if (seen.insert(path).second) {
+          deps.push_back(path);
+        }
+      }
+    };
+    appendUnique(this->GetObjects(config));
+    // ignoreType is true so that static and object libraries report their
+    // link dependencies as well.
+    appendUnique(
+      this->ComputeLinkDeps(this->TargetLinkLanguage(config), config, true));
+    return deps;
+  }();
+
+  const std::string objectDir =
+    cmStrCat(this->GeneratorTarget->GetSupportDirectory(),
+             this->GetGlobalGenerator()->ConfigDirectory(config));
+  const std::string ninjaOutputDir = this->ConvertToNinjaPath(objectDir);
+  const std::string fatbinHeader =
+    cmStrCat(ninjaOutputDir, "/cmake_cuda_fatbin.h");
+  const std::string registerHeader =
+    cmStrCat(ninjaOutputDir, "/cmake_cuda_register.h");
+
+  cmNinjaBuild fatbinary(LanguageLinkerCudaFatbinaryRule(config));
+
+  // Link device code for each architecture.
+  std::unordered_set<std::string> linkedArchitectures;
+  for (const std::string& architectureKind : architectures) {
+    // Clang always generates real code, so strip the specifier.
+    const std::string architecture =
+      architectureKind.substr(0, architectureKind.find('-'));
+
+    // Entries such as "70-real;70-virtual" collapse to the same architecture
+    // once the specifier is gone. Linking it twice would write two build
+    // statements for one cubin and repeat its fatbinary profile.
+    if (!linkedArchitectures.insert(architecture).second) {
+      continue;
+    }
+
+    const std::string cubin =
+      cmStrCat(ninjaOutputDir, "/sm_", architecture, ".cubin");
+
+    fatbinary.Variables["PROFILES"] +=
+      cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
+    fatbinary.ExplicitDeps.emplace_back(cubin);
+
+    cmNinjaBuild dlink(LanguageLinkerCudaDeviceRule(config));
+    dlink.ExplicitDeps = explicitDeps;
+    dlink.Outputs = { cubin };
+    dlink.Variables["ARCH"] = cmStrCat("sm_", architecture);
+
+    // The generated register file contains macros that when expanded register
+    // the device routines. Because the routines are the same for all
+    // architectures the register file will be the same too. Thus generate it
+    // only on the first invocation to reduce overhead.
+    if (fatbinary.ExplicitDeps.size() == 1) {
+      dlink.Variables["REGISTER"] =
+        cmStrCat("--register-link-binaries=", registerHeader);
+    }
+
+    this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), dlink);
+  }
+
+  // Combine all architectures into a single fatbinary.
+  fatbinary.Outputs = { fatbinHeader };
+  this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(),
+                                         fatbinary);
+
+  // Compile the stub that registers the kernels and contains the fatbinaries.
+  // It depends only on the fatbinary header, which in turn depends on every
+  // cubin, including the device link step that writes the register header.
+  cmNinjaBuild dcompile(LanguageLinkerCudaDeviceCompileRule(config));
+  dcompile.Outputs = { output };
+  dcompile.ExplicitDeps = { fatbinHeader };
+  // Both headers are passed as shell-formatted paths built from objectDir
+  // rather than from the Ninja path used for the build edges above.
+  dcompile.Variables["FATBIN"] =
+    this->GetLocalGenerator()->ConvertToOutputFormat(
+      cmStrCat(objectDir, "/cmake_cuda_fatbin.h"), cmOutputConverter::SHELL);
+  dcompile.Variables["REGISTER"] =
+    this->GetLocalGenerator()->ConvertToOutputFormat(
+      cmStrCat(objectDir, "/cmake_cuda_register.h"), cmOutputConverter::SHELL);
+  this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(),
+                                         dcompile);
+}
+
+void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkStatement(
+  const std::string& config, const std::string& fileConfig,
+  const std::string& outputDir, const std::string& output)
+{
+  cmGeneratorTarget* genTarget = this->GetGeneratorTarget();
+  cmGlobalNinjaGenerator* globalGen = this->GetGlobalGenerator();
+
   std::string targetOutputImplib = ConvertToNinjaPath(
     genTarget->GetFullPath(config, cmStateEnums::ImportLibraryArtifact));
 
@@ -606,8 +795,8 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
       cmStrCat(this->GetLocalGenerator()->GetTargetDirectory(genTarget),
                globalGen->ConfigDirectory(fileConfig), "/");
     targetOutputFileConfigDir =
-      globalGen->ExpandCFGIntDir(targetOutputDir, fileConfig);
-    if (targetOutputDir == targetOutputFileConfigDir) {
+      globalGen->ExpandCFGIntDir(outputDir, fileConfig);
+    if (outputDir == targetOutputFileConfigDir) {
       return;
     }
 
@@ -623,27 +812,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
     }
   }
 
-  if (firstForConfig) {
-    globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal);
-  }
-  this->DeviceLinkObject = targetOutputReal;
-
-  // Write comments.
-  cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream());
-  const cmStateEnums::TargetType targetType = genTarget->GetType();
-  this->GetCommonFileStream() << "# Device Link build statements for "
-                              << cmState::GetTargetTypeName(targetType)
-                              << " target " << this->GetTargetName() << "\n\n";
-
   // Compute the comment.
   cmNinjaBuild build(this->LanguageLinkerDeviceRule(config));
   build.Comment =
-    cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', targetOutputReal);
+    cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', output);
 
   cmNinjaVars& vars = build.Variables;
 
   // Compute outputs.
-  build.Outputs.push_back(targetOutputReal);
+  build.Outputs.push_back(output);
   // Compute specific libraries to link with.
   build.ExplicitDeps = this->GetObjects(config);
   build.ImplicitDeps =
@@ -659,7 +836,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
   cmLocalNinjaGenerator& localGen = *this->GetLocalGenerator();
 
   vars["TARGET_FILE"] =
-    localGen.ConvertToOutputFormat(targetOutputReal, cmOutputConverter::SHELL);
+    localGen.ConvertToOutputFormat(output, cmOutputConverter::SHELL);
 
   std::unique_ptr<cmLinkLineComputer> linkLineComputer(
     new cmNinjaLinkLineDeviceComputer(
@@ -683,8 +860,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
 
   // Compute language specific link flags.
   std::string langFlags;
-  localGen.AddLanguageFlagsForLinking(langFlags, genTarget, cudaLinkLanguage,
-                                      config);
+  localGen.AddLanguageFlagsForLinking(langFlags, genTarget, "CUDA", config);
   vars["LANGUAGE_COMPILE_FLAGS"] = langFlags;
 
   auto const tgtNames = this->TargetNames(config);
@@ -692,7 +868,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
     vars["SONAME_FLAG"] =
       this->GetMakefile()->GetSONameFlag(this->TargetLinkLanguage(config));
     vars["SONAME"] = tgtNames.SharedObject;
-    if (targetType == cmStateEnums::SHARED_LIBRARY) {
+    if (genTarget->GetType() == cmStateEnums::SHARED_LIBRARY) {
       std::string install_dir =
         this->GetGeneratorTarget()->GetInstallNameDirForBuildTree(config);
       if (!install_dir.empty()) {
@@ -731,7 +907,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
   // do not check if the user has explicitly forced a response file.
   int const commandLineLengthLimit =
     static_cast<int>(cmSystemTools::CalculateCommandLineLengthLimit()) -
-    globalGen->GetRuleCmdLength(this->LanguageLinkerDeviceRule(config));
+    globalGen->GetRuleCmdLength(build.Rule);
 
   build.RspFile = this->ConvertToNinjaPath(
     cmStrCat("CMakeFiles/", genTarget->GetName(),
@@ -746,7 +922,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
   bool usedResponseFile = false;
   globalGen->WriteBuild(this->GetCommonFileStream(), build,
                         commandLineLengthLimit, &usedResponseFile);
-  this->WriteDeviceLinkRule(usedResponseFile, config);
+  this->WriteNvidiaDeviceLinkRule(usedResponseFile, config);
 }
 
 void cmNinjaNormalTargetGenerator::WriteLinkStatement(

+ 14 - 1
Source/cmNinjaNormalTargetGenerator.h

@@ -21,18 +21,31 @@ public:
 private:
   std::string LanguageLinkerRule(const std::string& config) const;
   std::string LanguageLinkerDeviceRule(const std::string& config) const;
+  std::string LanguageLinkerCudaDeviceRule(const std::string& config) const;
+  std::string LanguageLinkerCudaDeviceCompileRule(
+    const std::string& config) const;
+  std::string LanguageLinkerCudaFatbinaryRule(const std::string& config) const;
 
   const char* GetVisibleTypeName() const;
   void WriteLanguagesRules(const std::string& config);
 
   void WriteLinkRule(bool useResponseFile, const std::string& config);
-  void WriteDeviceLinkRule(bool useResponseFile, const std::string& config);
+  void WriteDeviceLinkRules(const std::string& config);
+  void WriteNvidiaDeviceLinkRule(bool useResponseFile,
+                                 const std::string& config);
 
   void WriteLinkStatement(const std::string& config,
                           const std::string& fileConfig, bool firstForConfig);
   void WriteDeviceLinkStatement(const std::string& config,
                                 const std::string& fileConfig,
                                 bool firstForConfig);
+  void WriteDeviceLinkStatements(const std::string& config,
+                                 const std::vector<std::string>& architectures,
+                                 const std::string& output);
+  void WriteNvidiaDeviceLinkStatement(const std::string& config,
+                                      const std::string& fileConfig,
+                                      const std::string& outputDir,
+                                      const std::string& output);
 
   void WriteObjectLibStatement(const std::string& config);
 

+ 6 - 3
Source/cmNinjaTargetGenerator.cxx

@@ -346,11 +346,13 @@ std::string cmNinjaTargetGenerator::ComputeIncludes(
 }
 
 cmNinjaDeps cmNinjaTargetGenerator::ComputeLinkDeps(
-  const std::string& linkLanguage, const std::string& config) const
+  const std::string& linkLanguage, const std::string& config,
+  bool ignoreType) const
 {
   // Static libraries never depend on other targets for linking.
-  if (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY ||
-      this->GeneratorTarget->GetType() == cmStateEnums::OBJECT_LIBRARY) {
+  if (!ignoreType &&
+      (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY ||
+       this->GeneratorTarget->GetType() == cmStateEnums::OBJECT_LIBRARY)) {
     return cmNinjaDeps();
   }
 
@@ -1009,6 +1011,7 @@ void cmNinjaTargetGenerator::WriteObjectBuildStatements(
   {
     std::vector<cmSourceFile const*> objectSources;
     this->GeneratorTarget->GetObjectSources(objectSources, config);
+
     for (cmSourceFile const* sf : objectSources) {
       this->WriteObjectBuildStatement(sf, config, fileConfig, firstForConfig);
     }

+ 2 - 1
Source/cmNinjaTargetGenerator.h

@@ -113,7 +113,8 @@ protected:
 
   /// @return the list of link dependency for the given target @a target.
   cmNinjaDeps ComputeLinkDeps(const std::string& linkLanguage,
-                              const std::string& config) const;
+                              const std::string& config,
+                              bool ignoreType = false) const;
 
   /// @return the source file path for the given @a source.
   std::string GetSourceFilePath(cmSourceFile const* source) const;

+ 10 - 0
Source/cmRulePlaceholderExpander.cxx

@@ -141,6 +141,16 @@ std::string cmRulePlaceholderExpander::ExpandRuleVariable(
       return replaceValues.DependencyFile;
     }
   }
+  if (replaceValues.Fatbinary) {
+    if (variable == "FATBINARY") {
+      return replaceValues.Fatbinary;
+    }
+  }
+  if (replaceValues.RegisterFile) {
+    if (variable == "REGISTER_FILE") {
+      return replaceValues.RegisterFile;
+    }
+  }
 
   if (replaceValues.Target) {
     if (variable == "TARGET_QUOTED") {

+ 2 - 0
Source/cmRulePlaceholderExpander.h

@@ -64,6 +64,8 @@ public:
     const char* SwiftOutputFileMap;
     const char* SwiftSources;
     const char* ISPCHeader;
+    const char* Fatbinary;
+    const char* RegisterFile;
   };
 
   // Expand rule variables in CMake of the type found in language rules

+ 3 - 4
Tests/Cuda/CMakeLists.txt

@@ -17,13 +17,12 @@ add_cuda_test_macro(Cuda.SeparableCompCXXOnly SeparableCompCXXOnly)
 add_cuda_test_macro(Cuda.Toolkit Toolkit)
 add_cuda_test_macro(Cuda.IncludePathNoToolkit IncludePathNoToolkit)
 add_cuda_test_macro(Cuda.SharedRuntimePlusToolkit SharedRuntimePlusToolkit)
+add_cuda_test_macro(Cuda.Complex CudaComplex)
+add_cuda_test_macro(Cuda.ProperLinkFlags ProperLinkFlags)
 
-# Separable compilation is currently only supported on NVCC. Disable tests
-# using it for other compilers.
 if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
-  add_cuda_test_macro(Cuda.Complex CudaComplex)
+  # Clang lacks __CUDACC_VER*__ defines.
   add_cuda_test_macro(Cuda.ProperDeviceLibraries ProperDeviceLibraries)
-  add_cuda_test_macro(Cuda.ProperLinkFlags ProperLinkFlags)
 endif()
 
 # The CUDA only ships the shared version of the toolkit libraries

+ 17 - 19
Tests/CudaOnly/CMakeLists.txt

@@ -12,33 +12,31 @@ add_cuda_test_macro(CudaOnly.SharedRuntimePlusToolkit CudaOnlySharedRuntimePlusT
 add_cuda_test_macro(CudaOnly.Standard98 CudaOnlyStandard98)
 add_cuda_test_macro(CudaOnly.Toolkit CudaOnlyToolkit)
 add_cuda_test_macro(CudaOnly.WithDefs CudaOnlyWithDefs)
+add_cuda_test_macro(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine)
+add_cuda_test_macro(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
+add_cuda_test_macro(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
 
 if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
+  # Clang doesn't have flags for selecting the runtime.
   add_cuda_test_macro(CudaOnly.SharedRuntimeViaCUDAFlags CudaOnlySharedRuntimeViaCUDAFlags)
 
-  # Separable compilation is currently only supported on NVCC. Disable tests
-  # using it for other compilers.
-  add_cuda_test_macro(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine)
-  add_cuda_test_macro(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
-  add_cuda_test_macro(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
-
-  add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
-    ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
-    --build-and-test
-    "${CMAKE_CURRENT_SOURCE_DIR}/DontResolveDeviceSymbols/"
-    "${CMAKE_CURRENT_BINARY_DIR}/DontResolveDeviceSymbols/"
-    ${build_generator_args}
-    --build-project DontResolveDeviceSymbols
-    --build-options ${build_options}
-    --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
-  )
-  set_property(TEST "CudaOnly.DontResolveDeviceSymbols" APPEND
-    PROPERTY LABELS "CUDA")
-
   # Only NVCC defines __CUDACC_DEBUG__ when compiling in debug mode.
   add_cuda_test_macro(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
 endif()
 
+add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
+  ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
+  --build-and-test
+  "${CMAKE_CURRENT_SOURCE_DIR}/DontResolveDeviceSymbols/"
+  "${CMAKE_CURRENT_BINARY_DIR}/DontResolveDeviceSymbols/"
+  ${build_generator_args}
+  --build-project DontResolveDeviceSymbols
+  --build-options ${build_options}
+  --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
+)
+set_property(TEST "CudaOnly.DontResolveDeviceSymbols" APPEND
+  PROPERTY LABELS "CUDA")
+
 # The CUDA only ships the shared version of the toolkit libraries
 # on windows
 if(NOT WIN32)