Browse Source

CUDA: Add Device LTO support for nvcc

Fixes #22200
Robert Maynard 3 years ago
parent
commit
96bc59b1ca

+ 7 - 0
Help/release/dev/cuda-device-lto.rst

@@ -0,0 +1,7 @@
+cuda-device-lto
+---------------
+
+* ``CUDA`` language now supports device link time optimization when using
+  ``nvcc``. The :variable:`CMAKE_INTERPROCEDURAL_OPTIMIZATION` variable and
+  the associated :prop_tgt:`INTERPROCEDURAL_OPTIMIZATION` target property will
+  activate device LTO.

+ 24 - 13
Modules/CheckIPOSupported.cmake

@@ -76,6 +76,23 @@ endmacro()
 
 # Run IPO/LTO test
 macro(_ipo_run_language_check language)
+  set(_C_ext "c")
+  set(_CXX_ext "cpp")
+  set(_Fortran_ext "f")
+  string(COMPARE EQUAL "${language}" "CUDA" is_cuda)
+
+  set(ext ${_${language}_ext})
+  if(NOT "${ext}" STREQUAL "")
+    set(copy_sources foo.${ext} main.${ext})
+  elseif(is_cuda)
+    if(_CMAKE_CUDA_IPO_SUPPORTED_BY_CMAKE)
+      set("${X_RESULT}" YES PARENT_SCOPE)
+    endif()
+    return()
+  else()
+    message(FATAL_ERROR "Language not supported")
+  endif()
+
   set(testdir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/_CMakeLTOTest-${language}")
 
   file(REMOVE_RECURSE "${testdir}")
@@ -91,17 +108,6 @@ macro(_ipo_run_language_check language)
 
   set(try_compile_src "${CMAKE_ROOT}/Modules/CheckIPOSupported")
 
-  set(_C_ext "c")
-  set(_CXX_ext "cpp")
-  set(_Fortran_ext "f")
-
-  set(ext ${_${language}_ext})
-  if(NOT "${ext}" STREQUAL "")
-    set(copy_sources foo.${ext} main.${ext})
-  else()
-    message(FATAL_ERROR "Language not supported")
-  endif()
-
   # Use:
   # * TRY_COMPILE_PROJECT_NAME
   # * CMAKE_VERSION
@@ -211,6 +217,11 @@ function(check_ipo_supported)
       list(APPEND languages "C")
     endif()
 
+    list(FIND enabled_languages "CUDA" result)
+    if(NOT result EQUAL -1)
+      list(APPEND languages "CUDA")
+    endif()
+
     list(FIND enabled_languages "Fortran" result)
     if(NOT result EQUAL -1)
       list(APPEND languages "Fortran")
@@ -219,7 +230,7 @@ function(check_ipo_supported)
     string(COMPARE EQUAL "${languages}" "" no_languages)
     if(no_languages)
       _ipo_not_supported(
-          "no C/CXX/Fortran languages found in ENABLED_LANGUAGES global property"
+          "no C/CXX/CUDA/Fortran languages found in ENABLED_LANGUAGES global property"
       )
       return()
     endif()
@@ -227,7 +238,7 @@ function(check_ipo_supported)
     set(languages "${X_LANGUAGES}")
 
     set(unsupported_languages "${languages}")
-    list(REMOVE_ITEM unsupported_languages "C" "CXX" "Fortran")
+    list(REMOVE_ITEM unsupported_languages "C" "CXX" "CUDA" "Fortran")
     string(COMPARE NOTEQUAL "${unsupported_languages}" "" has_unsupported)
     if(has_unsupported)
       _ipo_not_supported(

+ 4 - 0
Modules/Compiler/Clang-CUDA.cmake

@@ -35,6 +35,10 @@ set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "cudadevrt;cudart_static")
 set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "cudadevrt;cudart")
 set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE   "")
 
+# Clang doesn't support CUDA device LTO
+set(_CMAKE_CUDA_IPO_SUPPORTED_BY_CMAKE NO)
+set(_CMAKE_CUDA_IPO_MAY_BE_SUPPORTED_BY_COMPILER NO)
+
 if(UNIX)
   list(APPEND CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "rt" "pthread" "dl")
 endif()

+ 8 - 0
Modules/Compiler/NVIDIA-CUDA.cmake

@@ -48,6 +48,13 @@ if((NOT DEFINED CMAKE_DEPENDS_USE_COMPILER OR CMAKE_DEPENDS_USE_COMPILER)
   set(CMAKE_CUDA_DEPENDS_USE_COMPILER TRUE)
 endif()
 
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2)
+  set(_CMAKE_CUDA_IPO_SUPPORTED_BY_CMAKE YES)
+  set(_CMAKE_CUDA_IPO_MAY_BE_SUPPORTED_BY_COMPILER YES)
+
+  set(CMAKE_CUDA_DEVICE_LINK_OPTIONS_IPO " -dlto")
+endif()
+
 if(NOT "x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
   set(CMAKE_CUDA_COMPILE_OPTIONS_PIE -Xcompiler=-fPIE)
   set(CMAKE_CUDA_COMPILE_OPTIONS_PIC -Xcompiler=-fPIC)
@@ -61,6 +68,7 @@ if(NOT "x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
   string(APPEND CMAKE_CUDA_FLAGS_MINSIZEREL_INIT " -O1 -DNDEBUG")
   string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO_INIT " -O2 -g -DNDEBUG")
 endif()
+
 set(CMAKE_SHARED_LIBRARY_CREATE_CUDA_FLAGS -shared)
 set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA -isystem=)
 

+ 27 - 3
Source/cmGeneratorTarget.cxx

@@ -916,11 +916,19 @@ bool cmGeneratorTarget::IsIPOEnabled(std::string const& lang,
     return false;
   }
 
-  if (lang != "C" && lang != "CXX" && lang != "Fortran") {
+  if (lang != "C" && lang != "CXX" && lang != "CUDA" && lang != "Fortran") {
     // We do not define IPO behavior for other languages.
     return false;
   }
 
+  if (lang == "CUDA") {
+    // CUDA IPO requires both CUDA_ARCHITECTURES and CUDA_SEPARABLE_COMPILATION
+    if (cmIsOff(this->GetSafeProperty("CUDA_ARCHITECTURES")) ||
+        cmIsOff(this->GetSafeProperty("CUDA_SEPARABLE_COMPILATION"))) {
+      return false;
+    }
+  }
+
   cmPolicies::PolicyStatus cmp0069 = this->GetPolicyStatusCMP0069();
 
   if (cmp0069 == cmPolicies::OLD || cmp0069 == cmPolicies::WARN) {
@@ -3428,7 +3436,9 @@ void cmGeneratorTarget::AddExplicitLanguageFlags(std::string& flags,
                                              "EXPLICIT_LANGUAGE");
 }
 
-void cmGeneratorTarget::AddCUDAArchitectureFlags(std::string& flags) const
+void cmGeneratorTarget::AddCUDAArchitectureFlags(cmBuildStep compileOrLink,
+                                                 const std::string& config,
+                                                 std::string& flags) const
 {
   std::string property = this->GetSafeProperty("CUDA_ARCHITECTURES");
 
@@ -3460,6 +3470,7 @@ void cmGeneratorTarget::AddCUDAArchitectureFlags(std::string& flags) const
 
   std::string const& compiler =
     this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID");
+  const bool ipoEnabled = this->IsIPOEnabled("CUDA", config);
 
   // Check for special modes: `all`, `all-major`.
   if (property == "all" || property == "all-major") {
@@ -3539,6 +3550,13 @@ void cmGeneratorTarget::AddCUDAArchitectureFlags(std::string& flags) const
   }
 
   if (compiler == "NVIDIA") {
+    if (ipoEnabled && compileOrLink == cmBuildStep::Link) {
+      if (cmValue cudaIPOFlags =
+            this->Makefile->GetDefinition("CMAKE_CUDA_LINK_OPTIONS_IPO")) {
+        flags += cudaIPOFlags;
+      }
+    }
+
     for (CudaArchitecture& architecture : architectures) {
       flags +=
         " --generate-code=arch=compute_" + architecture.name + ",code=[";
@@ -3551,7 +3569,13 @@ void cmGeneratorTarget::AddCUDAArchitectureFlags(std::string& flags) const
         }
       }
 
-      if (architecture.real) {
+      if (ipoEnabled) {
+        if (compileOrLink == cmBuildStep::Compile) {
+          flags += "lto_" + architecture.name;
+        } else if (compileOrLink == cmBuildStep::Link) {
+          flags += "sm_" + architecture.name;
+        }
+      } else if (architecture.real) {
         flags += "sm_" + architecture.name;
       }
 

+ 4 - 1
Source/cmGeneratorTarget.h

@@ -23,6 +23,7 @@
 #include "cmStateTypes.h"
 #include "cmValue.h"
 
+enum class cmBuildStep;
 class cmComputeLinkInformation;
 class cmCustomCommand;
 class cmGlobalGenerator;
@@ -471,7 +472,9 @@ public:
   void AddExplicitLanguageFlags(std::string& flags,
                                 cmSourceFile const& sf) const;
 
-  void AddCUDAArchitectureFlags(std::string& flags) const;
+  void AddCUDAArchitectureFlags(cmBuildStep compileOrLink,
+                                const std::string& config,
+                                std::string& flags) const;
   void AddCUDAToolkitFlags(std::string& flags) const;
 
   void AddHIPArchitectureFlags(std::string& flags) const;

+ 2 - 2
Source/cmGhsMultiTargetGenerator.cxx

@@ -183,8 +183,8 @@ void cmGhsMultiTargetGenerator::SetCompilerFlags(std::string const& config,
   auto i = this->FlagsByLanguage.find(language);
   if (i == this->FlagsByLanguage.end()) {
     std::string flags;
-    this->LocalGenerator->AddLanguageFlags(flags, this->GeneratorTarget,
-                                           language, config);
+    this->LocalGenerator->AddLanguageFlags(
+      flags, this->GeneratorTarget, cmBuildStep::Compile, language, config);
     this->LocalGenerator->AddCMP0018Flags(flags, this->GeneratorTarget,
                                           language, config);
     this->LocalGenerator->AddVisibilityPresetFlags(

+ 2 - 2
Source/cmGlobalXCodeGenerator.cxx

@@ -2368,8 +2368,8 @@ void cmGlobalXCodeGenerator::CreateBuildSettings(cmGeneratorTarget* gtgt,
     std::string& flags = cflags[lang];
 
     // Add language-specific flags.
-    this->CurrentLocalGenerator->AddLanguageFlags(flags, gtgt, lang,
-                                                  configName);
+    this->CurrentLocalGenerator->AddLanguageFlags(
+      flags, gtgt, cmBuildStep::Compile, lang, configName);
 
     if (gtgt->IsIPOEnabled(lang, configName)) {
       this->CurrentLocalGenerator->AppendFeatureOptions(flags, lang, "IPO");

+ 20 - 0
Source/cmLinkLineDeviceComputer.cxx

@@ -68,6 +68,26 @@ bool cmLinkLineDeviceComputer::ComputeRequiresDeviceLinking(
     });
 }
 
+bool cmLinkLineDeviceComputer::ComputeRequiresDeviceLinkingIPOFlag(
+  cmComputeLinkInformation& cli)
+{
+  // Determine if this item might requires device linking.
+  // For this we only consider targets
+  using ItemVector = cmComputeLinkInformation::ItemVector;
+  ItemVector const& items = cli.GetItems();
+  std::string config = cli.GetConfig();
+  return std::any_of(
+    items.begin(), items.end(),
+    [config](cmComputeLinkInformation::Item const& item) -> bool {
+      return item.Target &&
+        item.Target->GetType() == cmStateEnums::STATIC_LIBRARY &&
+        // this dependency requires us to device link it
+        !item.Target->GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS") &&
+        item.Target->GetPropertyAsBool("CUDA_SEPARABLE_COMPILATION") &&
+        item.Target->IsIPOEnabled("CUDA", config);
+    });
+}
+
 void cmLinkLineDeviceComputer::ComputeLinkLibraries(
   cmComputeLinkInformation& cli, std::string const& stdLibString,
   std::vector<BT<std::string>>& linkLibraries)

+ 1 - 0
Source/cmLinkLineDeviceComputer.h

@@ -30,6 +30,7 @@ public:
     delete;
 
   bool ComputeRequiresDeviceLinking(cmComputeLinkInformation& cli);
+  bool ComputeRequiresDeviceLinkingIPOFlag(cmComputeLinkInformation& cli);
 
   void ComputeLinkLibraries(
     cmComputeLinkInformation& cli, std::string const& stdLibString,

+ 21 - 4
Source/cmLocalGenerator.cxx

@@ -36,6 +36,7 @@
 #include "cmInstallScriptGenerator.h"
 #include "cmInstallTargetGenerator.h"
 #include "cmLinkLineComputer.h"
+#include "cmLinkLineDeviceComputer.h"
 #include "cmMakefile.h"
 #include "cmRange.h"
 #include "cmRulePlaceholderExpander.h"
@@ -1381,7 +1382,7 @@ std::vector<BT<std::string>> cmLocalGenerator::GetStaticLibraryFlags(
 }
 
 void cmLocalGenerator::GetDeviceLinkFlags(
-  cmLinkLineComputer& linkLineComputer, const std::string& config,
+  cmLinkLineDeviceComputer& linkLineComputer, const std::string& config,
   std::string& linkLibs, std::string& linkFlags, std::string& frameworkPath,
   std::string& linkPath, cmGeneratorTarget* target)
 {
@@ -1389,6 +1390,18 @@ void cmLocalGenerator::GetDeviceLinkFlags(
 
   cmComputeLinkInformation* pcli = target->GetLinkInformation(config);
 
+  auto linklang = linkLineComputer.GetLinkerLanguage(target, config);
+  auto ipoEnabled = target->IsIPOEnabled(linklang, config);
+  if (!ipoEnabled) {
+    ipoEnabled = linkLineComputer.ComputeRequiresDeviceLinkingIPOFlag(*pcli);
+  }
+  if (ipoEnabled) {
+    if (cmValue cudaIPOFlags = this->Makefile->GetDefinition(
+          "CMAKE_CUDA_DEVICE_LINK_OPTIONS_IPO")) {
+      linkFlags += cudaIPOFlags;
+    }
+  }
+
   if (pcli) {
     // Compute the required device link libraries when
     // resolving gpu lang device symbols
@@ -1396,6 +1409,8 @@ void cmLocalGenerator::GetDeviceLinkFlags(
                               linkPath);
   }
 
+  // iterate link deps and see if any of them need IPO
+
   std::vector<std::string> linkOpts;
   target->GetLinkOptions(linkOpts, config, "CUDA");
   // LINK_OPTIONS are escaped.
@@ -1590,7 +1605,8 @@ std::vector<BT<std::string>> cmLocalGenerator::GetTargetCompileFlags(
   cmMakefile* mf = this->GetMakefile();
 
   // Add language-specific flags.
-  this->AddLanguageFlags(compileFlags, target, lang, config);
+  this->AddLanguageFlags(compileFlags, target, cmBuildStep::Compile, lang,
+                         config);
 
   if (target->IsIPOEnabled(lang, config)) {
     this->AppendFeatureOptions(compileFlags, lang, "IPO");
@@ -1903,6 +1919,7 @@ void cmLocalGenerator::AddArchitectureFlags(std::string& flags,
 
 void cmLocalGenerator::AddLanguageFlags(std::string& flags,
                                         cmGeneratorTarget const* target,
+                                        cmBuildStep compileOrLink,
                                         const std::string& lang,
                                         const std::string& config)
 {
@@ -1926,7 +1943,7 @@ void cmLocalGenerator::AddLanguageFlags(std::string& flags,
       }
     }
   } else if (lang == "CUDA") {
-    target->AddCUDAArchitectureFlags(flags);
+    target->AddCUDAArchitectureFlags(compileOrLink, config, flags);
     target->AddCUDAToolkitFlags(flags);
   } else if (lang == "ISPC") {
     target->AddISPCTargetFlags(flags);
@@ -2038,7 +2055,7 @@ void cmLocalGenerator::AddLanguageFlagsForLinking(
     this->AddCompilerRequirementFlag(flags, target, lang, config);
   }
 
-  this->AddLanguageFlags(flags, target, lang, config);
+  this->AddLanguageFlags(flags, target, cmBuildStep::Link, lang, config);
 
   if (target->IsIPOEnabled(lang, config)) {
     this->AppendFeatureOptions(flags, lang, "IPO");

+ 11 - 2
Source/cmLocalGenerator.h

@@ -35,6 +35,7 @@ class cmGeneratorTarget;
 class cmGlobalGenerator;
 class cmImplicitDependsList;
 class cmLinkLineComputer;
+class cmLinkLineDeviceComputer;
 class cmMakefile;
 class cmRulePlaceholderExpander;
 class cmSourceFile;
@@ -59,6 +60,13 @@ enum class cmDependencyScannerKind
   Compiler
 };
 
+/** What to compute language flags for */
+enum class cmBuildStep
+{
+  Compile,
+  Link
+};
+
 /** Target and source file which have a specific output.  */
 struct cmSourcesWithOutput
 {
@@ -143,7 +151,8 @@ public:
                             const std::string& filterArch = std::string());
 
   void AddLanguageFlags(std::string& flags, cmGeneratorTarget const* target,
-                        const std::string& lang, const std::string& config);
+                        cmBuildStep compileOrLink, const std::string& lang,
+                        const std::string& config);
   void AddLanguageFlagsForLinking(std::string& flags,
                                   cmGeneratorTarget const* target,
                                   const std::string& lang,
@@ -476,7 +485,7 @@ public:
 
   /** Fill out these strings for the given target.  Libraries to link,
    *  flags, and linkflags. */
-  void GetDeviceLinkFlags(cmLinkLineComputer& linkLineComputer,
+  void GetDeviceLinkFlags(cmLinkLineDeviceComputer& linkLineComputer,
                           const std::string& config, std::string& linkLibs,
                           std::string& linkFlags, std::string& frameworkPath,
                           std::string& linkPath, cmGeneratorTarget* target);

+ 2 - 1
Source/cmLocalVisualStudio7Generator.cxx

@@ -680,7 +680,8 @@ void cmLocalVisualStudio7Generator::WriteConfiguration(
     langForClCompile = linkLanguage;
     if (langForClCompile == "C" || langForClCompile == "CXX" ||
         langForClCompile == "Fortran") {
-      this->AddLanguageFlags(flags, target, langForClCompile, configName);
+      this->AddLanguageFlags(flags, target, cmBuildStep::Compile,
+                             langForClCompile, configName);
     }
     // set the correct language
     if (linkLanguage == "C") {

+ 9 - 8
Source/cmMakefileExecutableTargetGenerator.cxx

@@ -136,17 +136,11 @@ void cmMakefileExecutableTargetGenerator::WriteNvidiaDeviceExecutableRule(
   std::vector<std::string> depends;
   this->AppendLinkDepends(depends, linkLanguage);
 
-  // Build a list of compiler flags and linker flags.
-  std::string langFlags;
-  std::string linkFlags;
-
   // Add language feature flags.
+  std::string langFlags;
   this->LocalGenerator->AddLanguageFlagsForLinking(
     langFlags, this->GeneratorTarget, linkLanguage, this->GetConfigName());
 
-  // Add device-specific linker flags.
-  this->GetDeviceLinkFlags(linkFlags, linkLanguage);
-
   // Construct a list of files associated with this executable that
   // may need to be cleaned.
   std::vector<std::string> exeCleanFiles;
@@ -173,13 +167,20 @@ void cmMakefileExecutableTargetGenerator::WriteNvidiaDeviceExecutableRule(
     // Set path conversion for link script shells.
     this->LocalGenerator->SetLinkScriptShell(useLinkScript);
 
-    std::unique_ptr<cmLinkLineComputer> linkLineComputer(
+    std::unique_ptr<cmLinkLineDeviceComputer> linkLineComputer(
       new cmLinkLineDeviceComputer(
         this->LocalGenerator,
         this->LocalGenerator->GetStateSnapshot().GetDirectory()));
     linkLineComputer->SetForResponse(useResponseFileForLibs);
     linkLineComputer->SetRelink(relink);
 
+    // Create set of linking flags.
+    std::string linkFlags;
+    std::string ignored_;
+    this->LocalGenerator->GetDeviceLinkFlags(
+      *linkLineComputer, this->GetConfigName(), ignored_, linkFlags, ignored_,
+      ignored_, this->GeneratorTarget);
+
     // Collect up flags to link in needed libraries.
     std::string linkLibs;
     this->CreateLinkLibs(

+ 8 - 5
Source/cmMakefileLibraryTargetGenerator.cxx

@@ -287,10 +287,6 @@ void cmMakefileLibraryTargetGenerator::WriteNvidiaDeviceLibraryRules(
   this->LocalGenerator->AddLanguageFlagsForLinking(
     langFlags, this->GeneratorTarget, linkLanguage, this->GetConfigName());
 
-  // Create set of linking flags.
-  std::string linkFlags;
-  this->GetDeviceLinkFlags(linkFlags, linkLanguage);
-
   // Clean files associated with this library.
   std::set<std::string> libCleanFiles;
   libCleanFiles.insert(
@@ -315,13 +311,20 @@ void cmMakefileLibraryTargetGenerator::WriteNvidiaDeviceLibraryRules(
 
     // Collect up flags to link in needed libraries.
     std::string linkLibs;
-    std::unique_ptr<cmLinkLineComputer> linkLineComputer(
+    std::unique_ptr<cmLinkLineDeviceComputer> linkLineComputer(
       new cmLinkLineDeviceComputer(
         this->LocalGenerator,
         this->LocalGenerator->GetStateSnapshot().GetDirectory()));
     linkLineComputer->SetForResponse(useResponseFileForLibs);
     linkLineComputer->SetRelink(relink);
 
+    // Create set of linking flags.
+    std::string linkFlags;
+    std::string ignored_;
+    this->LocalGenerator->GetDeviceLinkFlags(
+      *linkLineComputer, this->GetConfigName(), ignored_, linkFlags, ignored_,
+      ignored_, this->GeneratorTarget);
+
     this->CreateLinkLibs(
       linkLineComputer.get(), linkLibs, useResponseFileForLibs, depends,
       cmMakefileTargetGenerator::ResponseFlagFor::DeviceLink);

+ 7 - 4
Source/cmVisualStudio10TargetGenerator.cxx

@@ -3300,6 +3300,7 @@ bool cmVisualStudio10TargetGenerator::ComputeClOptions(
   this->LangForClCompile = langForClCompile;
   if (!langForClCompile.empty()) {
     this->LocalGenerator->AddLanguageFlags(flags, this->GeneratorTarget,
+                                           cmBuildStep::Compile,
                                            langForClCompile, configName);
     this->LocalGenerator->AddCompileOptions(flags, this->GeneratorTarget,
                                             langForClCompile, configName);
@@ -3675,8 +3676,8 @@ bool cmVisualStudio10TargetGenerator::ComputeCudaOptions(
 
   // Get compile flags for CUDA in this directory.
   std::string flags;
-  this->LocalGenerator->AddLanguageFlags(flags, this->GeneratorTarget, "CUDA",
-                                         configName);
+  this->LocalGenerator->AddLanguageFlags(
+    flags, this->GeneratorTarget, cmBuildStep::Compile, "CUDA", configName);
   this->LocalGenerator->AddCompileOptions(flags, this->GeneratorTarget, "CUDA",
                                           configName);
 
@@ -3947,7 +3948,8 @@ bool cmVisualStudio10TargetGenerator::ComputeMasmOptions(
 
   std::string flags;
   this->LocalGenerator->AddLanguageFlags(flags, this->GeneratorTarget,
-                                         "ASM_MASM", configName);
+                                         cmBuildStep::Compile, "ASM_MASM",
+                                         configName);
 
   masmOptions.Parse(flags);
 
@@ -3999,7 +4001,8 @@ bool cmVisualStudio10TargetGenerator::ComputeNasmOptions(
 
   std::string flags;
   this->LocalGenerator->AddLanguageFlags(flags, this->GeneratorTarget,
-                                         "ASM_NASM", configName);
+                                         cmBuildStep::Compile, "ASM_NASM",
+                                         configName);
   flags += " -f";
   flags += this->Makefile->GetSafeDefinition("CMAKE_ASM_NASM_OBJECT_FORMAT");
   nasmOptions.Parse(flags);

+ 5 - 0
Tests/CMakeLists.txt

@@ -618,6 +618,11 @@ if(BUILD_TESTING)
   set(Module.CheckIPOSupported-CXX_BUILD_OPTIONS -DCMake_TEST_IPO_WORKS_CXX=${CMake_TEST_IPO_WORKS_CXX})
   ADD_TEST_MACRO(Module.CheckIPOSupported-CXX CheckIPOSupported-CXX)
 
+  if(CMake_TEST_CUDA)
+    ADD_TEST_MACRO(Module.CheckIPOSupported-CUDA CheckIPOSupported-CUDA)
+    set_property(TEST Module.CheckIPOSupported-CUDA APPEND PROPERTY LABELS "CUDA")
+  endif()
+
   if(CMAKE_Fortran_COMPILER)
     set(Module.CheckIPOSupported-Fortran_BUILD_OPTIONS -DCMake_TEST_IPO_WORKS_Fortran=${CMake_TEST_IPO_WORKS_Fortran})
     ADD_TEST_MACRO(Module.CheckIPOSupported-Fortran CheckIPOSupported-Fortran)

+ 13 - 11
Tests/CudaOnly/CMakeLists.txt

@@ -7,7 +7,6 @@ endmacro ()
 add_cuda_test_macro(CudaOnly.Architecture Architecture)
 add_cuda_test_macro(CudaOnly.ArchSpecial CudaOnlyArchSpecial)
 add_cuda_test_macro(CudaOnly.CompileFlags CudaOnlyCompileFlags)
-
 add_cuda_test_macro(CudaOnly.EnableStandard CudaOnlyEnableStandard)
 add_cuda_test_macro(CudaOnly.ExportPTX CudaOnlyExportPTX)
 add_cuda_test_macro(CudaOnly.SharedRuntimePlusToolkit CudaOnlySharedRuntimePlusToolkit)
@@ -28,6 +27,19 @@ if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
   add_cuda_test_macro(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
 endif()
 
+# The CUDA only ships the shared version of the toolkit libraries
+# on windows
+if(NOT WIN32)
+  add_cuda_test_macro(CudaOnly.StaticRuntimePlusToolkit CudaOnlyStaticRuntimePlusToolkit)
+endif()
+
+add_cuda_test_macro(CudaOnly.DeviceLTO CudaOnlyDeviceLTO)
+
+if(MSVC)
+  # Tests for features that only work with MSVC
+  add_cuda_test_macro(CudaOnly.PDB CudaOnlyPDB)
+endif()
+
 add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
   ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
   --build-and-test
@@ -41,16 +53,6 @@ add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
 set_property(TEST "CudaOnly.DontResolveDeviceSymbols" APPEND
   PROPERTY LABELS "CUDA")
 
-# The CUDA only ships the shared version of the toolkit libraries
-# on windows
-if(NOT WIN32)
-  add_cuda_test_macro(CudaOnly.StaticRuntimePlusToolkit CudaOnlyStaticRuntimePlusToolkit)
-endif()
-
-if(MSVC)
-  add_cuda_test_macro(CudaOnly.PDB CudaOnlyPDB)
-endif()
-
 add_test(NAME CudaOnly.RuntimeControls COMMAND
   ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
   --build-and-test

+ 37 - 0
Tests/CudaOnly/DeviceLTO/CMakeLists.txt

@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.18)
+project(DeviceLTO CUDA)
+
+# Goal:
+# Verify that we correctly compile with device LTO
+# Verify that device LTO requirements are propagated to
+# the final device link line
+
+add_library(CUDA_dlto STATIC file1.cu file2.cu file3.cu)
+add_executable(CudaOnlyDeviceLTO main.cu)
+
+set_target_properties(CUDA_dlto
+                      PROPERTIES
+                      CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}"
+                      CUDA_SEPARABLE_COMPILATION ON
+                      POSITION_INDEPENDENT_CODE ON)
+
+set_target_properties(CudaOnlyDeviceLTO
+                      PROPERTIES
+                      CUDA_SEPARABLE_COMPILATION ON
+                      CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}"
+                      )
+
+target_link_libraries(CudaOnlyDeviceLTO PRIVATE CUDA_dlto)
+
+include(CheckIPOSupported)
+check_ipo_supported(LANGUAGES CUDA RESULT ipo_supported)
+if(ipo_supported)
+  set_target_properties(CUDA_dlto
+                        PROPERTIES
+                        INTERPROCEDURAL_OPTIMIZATION ON)
+
+  # When non-LTO variants (i.e. virtual) are built together with LTO ones the
+  # linker warns about missing device LTO for the virtual architectures.
+  # Ignore these warnings.
+  target_link_options(CudaOnlyDeviceLTO PRIVATE "$<DEVICE_LINK:-w>")
+endif()

+ 17 - 0
Tests/CudaOnly/DeviceLTO/file1.cu

@@ -0,0 +1,17 @@
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+extern __device__ int file2_func(int);
+void __global__ kernel(int x)
+{
+  file2_func(x);
+}
+
+EXPORT int launch_kernel(int x)
+{
+  kernel<<<1, 1>>>(x);
+  return x;
+}

+ 5 - 0
Tests/CudaOnly/DeviceLTO/file2.cu

@@ -0,0 +1,5 @@
+extern __device__ int file3_func(int);
+int __device__ file2_func(int x)
+{
+  return x + file3_func(x);
+}

+ 4 - 0
Tests/CudaOnly/DeviceLTO/file3.cu

@@ -0,0 +1,4 @@
+int __device__ file3_func(int x)
+{
+  return x * x * x;
+}

+ 62 - 0
Tests/CudaOnly/DeviceLTO/main.cu

@@ -0,0 +1,62 @@
+#include <iostream>
+
+#include "cuda.h"
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#else
+#  define IMPORT
+#endif
+
+IMPORT int launch_kernel(int x);
+
+int choose_cuda_device()
+{
+  int nDevices = 0;
+  cudaError_t err = cudaGetDeviceCount(&nDevices);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, i);
+    if (err != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << i
+                << std::endl;
+      return 1;
+    }
+    std::cout << "prop.major: " << prop.major << std::endl;
+    err = cudaSetDevice(i);
+    if (err != cudaSuccess) {
+      std::cout << "Could not select CUDA device " << i << std::endl;
+    } else {
+      return 0;
+    }
+  }
+
+  std::cout << "Could not find a CUDA enabled card" << std::endl;
+
+  return 1;
+}
+
+int main()
+{
+  int ret = choose_cuda_device();
+  if (ret) {
+    return 0;
+  }
+
+  cudaError_t err;
+  launch_kernel(1);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "launch_kernel: kernel launch should have passed.\n "
+                 "Error message: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}

+ 32 - 0
Tests/Module/CheckIPOSupported-CUDA/CMakeLists.txt

@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 3.8)
+project(CheckIPOSupported-CUDA LANGUAGES CUDA)
+
+cmake_policy(SET CMP0069 NEW)
+
+include(CheckIPOSupported)
+check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output)
+if(ipo_supported)
+  set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON)
+endif()
+
+if(NOT ipo_supported AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA"
+   AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2)
+  message(FATAL_ERROR "CheckIPOSupported failed to correctly identify NVIDIA CUDA IPO support")
+endif()
+
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+add_library(foo STATIC foo.cu)
+set_target_properties(foo PROPERTIES
+      WINDOWS_EXPORT_ALL_SYMBOLS ON
+      POSITION_INDEPENDENT_CODE ON)
+
+add_library(bar SHARED bar.cu)
+set_target_properties(bar PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+target_link_libraries(bar PRIVATE foo)
+
+add_executable(CheckIPOSupported-CUDA main.cu)
+target_link_libraries(CheckIPOSupported-CUDA PUBLIC bar)
+
+enable_testing()
+add_test(NAME CheckIPOSupported-CUDA COMMAND CheckIPOSupported-CUDA)

+ 12 - 0
Tests/Module/CheckIPOSupported-CUDA/bar.cu

@@ -0,0 +1,12 @@
+__device__ int foo_func(int);
+
+void __global__ bar_kernel(int x)
+{
+  foo_func(x);
+}
+
+int launch_kernel(int x)
+{
+  bar_kernel<<<1, 1>>>(x);
+  return x;
+}

+ 4 - 0
Tests/Module/CheckIPOSupported-CUDA/foo.cu

@@ -0,0 +1,4 @@
+extern __device__ int foo_func(int a)
+{
+  return a * 42 + 9;
+}

+ 62 - 0
Tests/Module/CheckIPOSupported-CUDA/main.cu

@@ -0,0 +1,62 @@
+#include <iostream>
+
+#include "cuda.h"
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#else
+#  define IMPORT
+#endif
+
+IMPORT int launch_kernel(int x);
+
+int choose_cuda_device()
+{
+  int nDevices = 0;
+  cudaError_t err = cudaGetDeviceCount(&nDevices);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, i);
+    if (err != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << i
+                << std::endl;
+      return 1;
+    }
+    std::cout << "prop.major: " << prop.major << std::endl;
+    err = cudaSetDevice(i);
+    if (err != cudaSuccess) {
+      std::cout << "Could not select CUDA device " << i << std::endl;
+    } else {
+      return 0;
+    }
+  }
+
+  std::cout << "Could not find a CUDA enabled card" << std::endl;
+
+  return 1;
+}
+
+int main()
+{
+  int ret = choose_cuda_device();
+  if (ret) {
+    return 0;
+  }
+
+  cudaError_t err;
+  launch_kernel(1);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "launch_kernel: kernel launch should have passed.\n "
+                 "Error message: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}

+ 2 - 2
Tests/RunCMake/CheckIPOSupported/default-lang-none-stderr.txt

@@ -1,6 +1,6 @@
 ^CMake Error at .*/Modules/CheckIPOSupported\.cmake:[0-9]+ \(message\):
-  IPO is not supported \(no C/CXX/Fortran languages found in ENABLED_LANGUAGES
-  global property\)\.
+  IPO is not supported \(no C/CXX/CUDA/Fortran languages found in
+  ENABLED_LANGUAGES global property\)\.
 Call Stack \(most recent call first\):
   .*/Modules/CheckIPOSupported\.cmake:[0-9]+ \(_ipo_not_supported\)
   default-lang-none\.cmake:[0-9]+ \(check_ipo_supported\)