5 years ago · 1f9321c683
--- a/Help/manual/cmake-properties.7.rst
+++ b/Help/manual/cmake-properties.7.rst
@@ -173,6 +173,7 @@ Properties on Targets
 
				    /prop_tgt/CUDA_PTX_COMPILATION
			
 
				    /prop_tgt/CUDA_SEPARABLE_COMPILATION
			
 
				    /prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS
			
 
				+   /prop_tgt/CUDA_RUNTIME_LIBRARY
			
 
				    /prop_tgt/CUDA_EXTENSIONS
			
 
				    /prop_tgt/CUDA_STANDARD
			
 
				    /prop_tgt/CUDA_STANDARD_REQUIRED
			
--- a/Help/manual/cmake-variables.7.rst
+++ b/Help/manual/cmake-variables.7.rst
@@ -373,6 +373,7 @@ Variables that Control the Build
 
				    /variable/CMAKE_CTEST_ARGUMENTS
			
 
				    /variable/CMAKE_CUDA_SEPARABLE_COMPILATION
			
 
				    /variable/CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS
			
 
				+   /variable/CMAKE_CUDA_RUNTIME_LIBRARY
			
 
				    /variable/CMAKE_DEBUG_POSTFIX
			
 
				    /variable/CMAKE_DISABLE_PRECOMPILE_HEADERS
			
 
				    /variable/CMAKE_ENABLE_EXPORTS
			
--- a/Help/prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt
+++ b/Help/prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt
@@ -0,0 +1,9 @@
 
				+``None``
			
 
				+  Link with ``-cudart=none`` or equivalent flag(s) to use no CUDA
			
 
				+  runtime library.
			
 
				+``Shared``
			
 
				+  Link with ``-cudart=shared`` or equivalent flag(s) to use a
			
 
				+  dynamically-linked CUDA runtime library.
			
 
				+``Static``
			
 
				+  Link with ``-cudart=static`` or equivalent flag(s) to use a
			
 
				+  statically-linked CUDA runtime library.
			
--- a/Help/prop_tgt/CUDA_RUNTIME_LIBRARY.rst
+++ b/Help/prop_tgt/CUDA_RUNTIME_LIBRARY.rst
@@ -0,0 +1,21 @@
 
				+CUDA_RUNTIME_LIBRARY
			
 
				+--------------------
			
 
				+
			
 
				+Select the CUDA runtime library for use by compilers targeting the CUDA language.
			
 
				+
			
 
				+The allowed case insensitive values are:
			
 
				+
			
 
				+.. include:: CUDA_RUNTIME_LIBRARY-VALUES.txt
			
 
				+
			
 
				+Contents of ``CUDA_RUNTIME_LIBRARY`` may use
			
 
				+:manual:`generator expressions <cmake-generator-expressions(7)>`.
			
 
				+
			
 
				+If this property is not set then CMake uses the default value
			
 
				+``Static`` to select the CUDA runtime library.
			
 
				+
			
 
				+.. note::
			
 
				+
			
 
				+  This property has effect only when the ``CUDA`` language is enabled. To
			
 
				+  control the CUDA runtime linking when only using the CUDA SDK with the
			
 
				+  ``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit`
			
 
				+  module.
			
--- a/Help/release/dev/cuda-runtime-library.rst
+++ b/Help/release/dev/cuda-runtime-library.rst
@@ -0,0 +1,7 @@
 
				+cuda-runtime-library
			
 
				+--------------------
			
 
				+
			
 
				+* The :variable:`CMAKE_CUDA_RUNTIME_LIBRARY` variable and
			
 
				+  :prop_tgt:`CUDA_RUNTIME_LIBRARY` target property were introduced to
			
 
				+  select the CUDA runtime library used when linking targets that
			
 
				+  use CUDA.
			
--- a/Help/variable/CMAKE_CUDA_RUNTIME_LIBRARY.rst
+++ b/Help/variable/CMAKE_CUDA_RUNTIME_LIBRARY.rst
@@ -0,0 +1,24 @@
 
				+CMAKE_CUDA_RUNTIME_LIBRARY
			
 
				+--------------------------
			
 
				+
			
 
				+Select the CUDA runtime library for use by compilers targeting the MSVC ABI.
			
 
				+This variable is used to initialize the :prop_tgt:`CUDA_RUNTIME_LIBRARY`
			
 
				+property on all targets as they are created.
			
 
				+
			
 
				+The allowed case insensitive values are:
			
 
				+
			
 
				+.. include:: ../prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt
			
 
				+
			
 
				+Contents of ``CMAKE_CUDA_RUNTIME_LIBRARY`` may use
			
 
				+:manual:`generator expressions <cmake-generator-expressions(7)>`.
			
 
				+
			
 
				+If this variable is not set then the :prop_tgt:`CUDA_RUNTIME_LIBRARY` target
			
 
				+property will not be set automatically.  If that property is not set then
			
 
				+CMake uses the default value ``Static`` to select the CUDA runtime library.
			
 
				+
			
 
				+.. note::
			
 
				+
			
 
				+  This property has effect only when the ``CUDA`` language is enabled. To
			
 
				+  control the CUDA runtime linking when only using the CUDA SDK with the
			
 
				+  ``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit`
			
 
				+  module.
			
--- a/Modules/CMakeTestCUDACompiler.cmake
+++ b/Modules/CMakeTestCUDACompiler.cmake
@@ -67,6 +67,17 @@ else()
 
				     set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}")
			
 
				   endif()
			
 
				 
			
 
				+  # Remove the following libraries from CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES and
			
 
				+  # CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES
			
 
				+  #
			
 
				+  # - cudart
			
 
				+  # - cudart_static
			
 
				+  # - cudadevrt
			
 
				+  #
			
 
				+  # These are controlled by CMAKE_CUDA_RUNTIME_LIBRARY
			
 
				+  list(REMOVE_ITEM CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt)
			
 
				+  list(REMOVE_ITEM CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt)
			
 
				+
			
 
				   # Re-configure to save learned information.
			
 
				   configure_file(
			
 
				     ${CMAKE_ROOT}/Modules/CMakeCUDACompiler.cmake.in
			
--- a/Modules/Compiler/NVIDIA-CUDA.cmake
+++ b/Modules/Compiler/NVIDIA-CUDA.cmake
@@ -43,6 +43,11 @@ endif()
 
				 set(CMAKE_SHARED_LIBRARY_CREATE_CUDA_FLAGS -shared)
			
 
				 set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA -isystem=)
			
 
				 
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC")
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC  "cudadevrt;cudart_static")
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED  "cudadevrt;cudart")
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE    "")
			
 
				+
			
 
				 if("x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
			
 
				   set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "")
			
 
				   set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "")
			
--- a/Modules/Platform/Windows-NVIDIA-CUDA.cmake
+++ b/Modules/Platform/Windows-NVIDIA-CUDA.cmake
@@ -69,6 +69,11 @@ else()
 
				 endif()
			
 
				 unset(_cmp0092)
			
 
				 
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC")
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC  "cudadevrt;cudart_static")
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED  "cudadevrt;cudart")
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE    "")
			
 
				+
			
 
				 string(APPEND CMAKE_CUDA_FLAGS_INIT " ${PLATFORM_DEFINES_CUDA} -D_WINDOWS -Xcompiler=\"${_W3}${_FLAGS_CXX}\"")
			
 
				 string(APPEND CMAKE_CUDA_FLAGS_DEBUG_INIT " -Xcompiler=\"${_MDd}-Zi -Ob0 -Od ${_RTC1}\"")
			
 
				 string(APPEND CMAKE_CUDA_FLAGS_RELEASE_INIT " -Xcompiler=\"${_MD}-O2 -Ob2\" -DNDEBUG")
			
--- a/Source/cmComputeLinkInformation.cxx
+++ b/Source/cmComputeLinkInformation.cxx
@@ -10,6 +10,7 @@
 
				 
			
 
				 #include "cmAlgorithms.h"
			
 
				 #include "cmComputeLinkDepends.h"
			
 
				+#include "cmGeneratorExpression.h"
			
 
				 #include "cmGeneratorTarget.h"
			
 
				 #include "cmGlobalGenerator.h"
			
 
				 #include "cmListFileCache.h"
			
@@ -573,6 +574,15 @@ void cmComputeLinkInformation::AddImplicitLinkInfo()
 
				   cmGeneratorTarget::LinkClosure const* lc =
			
 
				     this->Target->GetLinkClosure(this->Config);
			
 
				   for (std::string const& li : lc->Languages) {
			
 
				+
			
 
				+    if (li == "CUDA") {
			
 
				+      // These need to go before the other implicit link information
			
 
				+      // as they could require symbols from those other library
			
 
				+      // Currently restricted to CUDA as it is the only language
			
 
				+      // we have documented runtime behavior controls for
			
 
				+      this->AddRuntimeLinkLibrary(li);
			
 
				+    }
			
 
				+
			
 
				     // Skip those of the linker language.  They are implicit.
			
 
				     if (li != this->LinkLanguage) {
			
 
				       this->AddImplicitLinkInfo(li);
			
@@ -580,6 +590,39 @@ void cmComputeLinkInformation::AddImplicitLinkInfo()
 
				   }
			
 
				 }
			
 
				 
			
 
				+void cmComputeLinkInformation::AddRuntimeLinkLibrary(std::string const& lang)
			
 
				+{ // Add the lang runtime library flags. This is activated by the presence
			
 
				+  // of a default selection whether or not it is overridden by a property.
			
 
				+  std::string defaultVar =
			
 
				+    cmStrCat("CMAKE_", lang, "_RUNTIME_LIBRARY_DEFAULT");
			
 
				+  const char* langRuntimeLibraryDefault =
			
 
				+    this->Makefile->GetDefinition(defaultVar);
			
 
				+  if (langRuntimeLibraryDefault && *langRuntimeLibraryDefault) {
			
 
				+    const char* runtimeLibraryValue =
			
 
				+      this->Target->GetProperty(cmStrCat(lang, "_RUNTIME_LIBRARY"));
			
 
				+    if (!runtimeLibraryValue) {
			
 
				+      runtimeLibraryValue = langRuntimeLibraryDefault;
			
 
				+    }
			
 
				+
			
 
				+    std::string runtimeLibrary =
			
 
				+      cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate(
			
 
				+        runtimeLibraryValue, this->Target->GetLocalGenerator(), this->Config,
			
 
				+        this->Target));
			
 
				+    if (!runtimeLibrary.empty()) {
			
 
				+      if (const char* runtimeLinkOptions = this->Makefile->GetDefinition(
			
 
				+            "CMAKE_" + lang + "_RUNTIME_LIBRARY_LINK_OPTIONS_" +
			
 
				+            runtimeLibrary)) {
			
 
				+        std::vector<std::string> libsVec = cmExpandedList(runtimeLinkOptions);
			
 
				+        for (std::string const& i : libsVec) {
			
 
				+          if (!cmContains(this->ImplicitLinkLibs, i)) {
			
 
				+            this->AddItem(i, nullptr);
			
 
				+          }
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				 void cmComputeLinkInformation::AddImplicitLinkInfo(std::string const& lang)
			
 
				 {
			
 
				   // Add libraries for this language that are not implied by the
			
--- a/Source/cmComputeLinkInformation.h
+++ b/Source/cmComputeLinkInformation.h
@@ -172,6 +172,7 @@ private:
 
				   void LoadImplicitLinkInfo();
			
 
				   void AddImplicitLinkInfo();
			
 
				   void AddImplicitLinkInfo(std::string const& lang);
			
 
				+  void AddRuntimeLinkLibrary(std::string const& lang);
			
 
				   std::set<std::string> ImplicitLinkDirs;
			
 
				   std::set<std::string> ImplicitLinkLibs;
			
 
				 
			
--- a/Source/cmTarget.cxx
+++ b/Source/cmTarget.cxx
@@ -358,6 +358,7 @@ cmTarget::cmTarget(std::string const& name, cmStateEnums::TargetType type,
 
				     initProp("CUDA_COMPILER_LAUNCHER");
			
 
				     initProp("CUDA_SEPARABLE_COMPILATION");
			
 
				     initProp("CUDA_RESOLVE_DEVICE_SYMBOLS");
			
 
				+    initProp("CUDA_RUNTIME_LIBRARY");
			
 
				     initProp("LINK_SEARCH_START_STATIC");
			
 
				     initProp("LINK_SEARCH_END_STATIC");
			
 
				     initProp("Swift_LANGUAGE_VERSION");
			
--- a/Source/cmVisualStudio10TargetGenerator.cxx
+++ b/Source/cmVisualStudio10TargetGenerator.cxx
@@ -3650,18 +3650,7 @@ bool cmVisualStudio10TargetGenerator::ComputeLinkOptions(
 
				   this->AddLibraries(cli, libVec, vsTargetVec, config);
			
 
				   if (cmContains(linkClosure->Languages, "CUDA") &&
			
 
				       this->CudaOptions[config] != nullptr) {
			
 
				-    switch (this->CudaOptions[config]->GetCudaRuntime()) {
			
 
				-      case cmVisualStudioGeneratorOptions::CudaRuntimeStatic:
			
 
				-        libVec.push_back("cudadevrt.lib");
			
 
				-        libVec.push_back("cudart_static.lib");
			
 
				-        break;
			
 
				-      case cmVisualStudioGeneratorOptions::CudaRuntimeShared:
			
 
				-        libVec.push_back("cudadevrt.lib");
			
 
				-        libVec.push_back("cudart.lib");
			
 
				-        break;
			
 
				-      case cmVisualStudioGeneratorOptions::CudaRuntimeNone:
			
 
				-        break;
			
 
				-    }
			
 
				+    this->CudaOptions[config]->FixCudaRuntime(this->GeneratorTarget);
			
 
				   }
			
 
				   std::string standardLibsVar =
			
 
				     cmStrCat("CMAKE_", linkLanguage, "_STANDARD_LIBRARIES");
			
--- a/Source/cmVisualStudioGeneratorOptions.cxx
+++ b/Source/cmVisualStudioGeneratorOptions.cxx
@@ -3,6 +3,8 @@
 
				 #include <cm/iterator>
			
 
				 
			
 
				 #include "cmAlgorithms.h"
			
 
				+#include "cmGeneratorExpression.h"
			
 
				+#include "cmGeneratorTarget.h"
			
 
				 #include "cmLocalVisualStudioGenerator.h"
			
 
				 #include "cmOutputConverter.h"
			
 
				 #include "cmSystemTools.h"
			
@@ -149,25 +151,33 @@ bool cmVisualStudioGeneratorOptions::UsingSBCS() const
 
				   return false;
			
 
				 }
			
 
				 
			
 
				-cmVisualStudioGeneratorOptions::CudaRuntime
			
 
				-cmVisualStudioGeneratorOptions::GetCudaRuntime() const
			
 
				+void cmVisualStudioGeneratorOptions::FixCudaRuntime(cmGeneratorTarget* target)
			
 
				 {
			
 
				   std::map<std::string, FlagValue>::const_iterator i =
			
 
				     this->FlagMap.find("CudaRuntime");
			
 
				-  if (i != this->FlagMap.end() && i->second.size() == 1) {
			
 
				-    std::string const& cudaRuntime = i->second[0];
			
 
				-    if (cudaRuntime == "Static") {
			
 
				-      return CudaRuntimeStatic;
			
 
				-    }
			
 
				-    if (cudaRuntime == "Shared") {
			
 
				-      return CudaRuntimeShared;
			
 
				-    }
			
 
				-    if (cudaRuntime == "None") {
			
 
				-      return CudaRuntimeNone;
			
 
				+  if (i == this->FlagMap.end()) {
			
 
				+    // User didn't provide am override so get the property value
			
 
				+    const char* runtimeLibraryValue =
			
 
				+      target->GetProperty("CUDA_RUNTIME_LIBRARY");
			
 
				+    if (runtimeLibraryValue) {
			
 
				+      std::string cudaRuntime =
			
 
				+        cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate(
			
 
				+          runtimeLibraryValue, this->LocalGenerator, this->Configuration,
			
 
				+          target));
			
 
				+      if (cudaRuntime == "STATIC") {
			
 
				+        this->AddFlag("CudaRuntime", "Static");
			
 
				+      }
			
 
				+      if (cudaRuntime == "SHARED") {
			
 
				+        this->AddFlag("CudaRuntime", "Shared");
			
 
				+      }
			
 
				+      if (cudaRuntime == "NONE") {
			
 
				+        this->AddFlag("CudaRuntime", "None");
			
 
				+      }
			
 
				+    } else {
			
 
				+      // nvcc default is static
			
 
				+      this->AddFlag("CudaRuntime", "Static");
			
 
				     }
			
 
				   }
			
 
				-  // nvcc default is static
			
 
				-  return CudaRuntimeStatic;
			
 
				 }
			
 
				 
			
 
				 void cmVisualStudioGeneratorOptions::FixCudaCodeGeneration()
			
--- a/Source/cmVisualStudioGeneratorOptions.h
+++ b/Source/cmVisualStudioGeneratorOptions.h
@@ -13,6 +13,7 @@
 
				 #include "cmIDEOptions.h"
			
 
				 
			
 
				 class cmLocalVisualStudioGenerator;
			
 
				+class cmGeneratorTarget;
			
 
				 
			
 
				 using cmVS7FlagTable = cmIDEFlagTable;
			
 
				 
			
@@ -61,15 +62,8 @@ public:
 
				   bool UsingUnicode() const;
			
 
				   bool UsingSBCS() const;
			
 
				 
			
 
				-  enum CudaRuntime
			
 
				-  {
			
 
				-    CudaRuntimeStatic,
			
 
				-    CudaRuntimeShared,
			
 
				-    CudaRuntimeNone
			
 
				-  };
			
 
				-  CudaRuntime GetCudaRuntime() const;
			
 
				-
			
 
				   void FixCudaCodeGeneration();
			
 
				+  void FixCudaRuntime(cmGeneratorTarget* target);
			
 
				 
			
 
				   void FixManifestUACFlags();
			
 
				 
			
--- a/Tests/Cuda/Complex/CMakeLists.txt
+++ b/Tests/Cuda/Complex/CMakeLists.txt
@@ -22,18 +22,11 @@ set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
 
				 set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
			
 
				 
			
 
				 add_library(CudaComplexCppBase SHARED dynamic.cpp)
			
 
				-add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
			
 
				-set_target_properties(CudaComplexSeperableLib
			
 
				-                       PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
			
 
				-set_target_properties( CudaComplexSeperableLib
			
 
				-                       PROPERTIES POSITION_INDEPENDENT_CODE ON)
			
 
				-
			
 
				 add_library(CudaComplexSharedLib SHARED dynamic.cu)
			
 
				 target_link_libraries(CudaComplexSharedLib PUBLIC CudaComplexCppBase)
			
 
				 
			
 
				+add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
			
 
				 add_library(CudaComplexMixedLib SHARED mixed.cpp mixed.cu)
			
 
				-set_target_properties(CudaComplexMixedLib
			
 
				-                       PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
			
 
				 target_link_libraries(CudaComplexMixedLib
			
 
				                       PUBLIC CudaComplexSharedLib
			
 
				                       PRIVATE CudaComplexSeperableLib)
			
@@ -41,7 +34,27 @@ target_link_libraries(CudaComplexMixedLib
 
				 add_executable(CudaComplex main.cpp)
			
 
				 target_link_libraries(CudaComplex PUBLIC CudaComplexMixedLib)
			
 
				 
			
 
				+
			
 
				+set_target_properties(CudaComplexMixedLib
			
 
				+                      CudaComplexSeperableLib
			
 
				+                      PROPERTIES
			
 
				+                        POSITION_INDEPENDENT_CODE  ON
			
 
				+                        CUDA_SEPARABLE_COMPILATION ON
			
 
				+                      )
			
 
				+set_target_properties(CudaComplexMixedLib
			
 
				+                      CudaComplexSharedLib
			
 
				+                      PROPERTIES
			
 
				+                        CUDA_RUNTIME_LIBRARY shared
			
 
				+                      )
			
 
				+
			
 
				+
			
 
				 if(APPLE)
			
 
				   # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
			
 
				   set_property(TARGET CudaComplex PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
			
 
				 endif()
			
 
				+
			
 
				+if(UNIX)
			
 
				+  # Help the shared cuda runtime find libcudart as it is not located
			
 
				+  # in a default system searched location
			
 
				+  set_property(TARGET CudaComplexMixedLib PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
			
 
				+endif()
			
--- a/Tests/Cuda/Complex/dynamic.cu
+++ b/Tests/Cuda/Complex/dynamic.cu
@@ -54,17 +54,20 @@ EXPORT int choose_cuda_device()
 
				   return 1;
			
 
				 }
			
 
				 
			
 
				-EXPORT void cuda_dynamic_lib_func()
			
 
				+EXPORT bool cuda_dynamic_lib_func()
			
 
				 {
			
 
				-  DetermineIfValidCudaDevice<<<1, 1>>>();
			
 
				   cudaError_t err = cudaGetLastError();
			
 
				   if (err != cudaSuccess) {
			
 
				-    std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
			
 
				+    std::cerr << "DetermineIfValidCudaDevice [Per Launch] failed: "
			
 
				               << cudaGetErrorString(err) << std::endl;
			
 
				+    return false;
			
 
				   }
			
 
				+  DetermineIfValidCudaDevice<<<1, 1>>>();
			
 
				   err = cudaDeviceSynchronize();
			
 
				   if (err != cudaSuccess) {
			
 
				-    std::cerr << "DetermineIfValidCudaDevice [ASYNC] failed: "
			
 
				+    std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
			
 
				               << cudaGetErrorString(cudaGetLastError()) << std::endl;
			
 
				+    return false;
			
 
				   }
			
 
				+  return true;
			
 
				 }
			
--- a/Tests/Cuda/Complex/main.cpp
+++ b/Tests/Cuda/Complex/main.cpp
@@ -22,5 +22,6 @@ int main(int argc, char** argv)
 
				 
			
 
				   int r1 = call_cuda_seperable_code(42);
			
 
				   int r2 = mixed_launch_kernel(42);
			
 
				+
			
 
				   return (r1 == 42 || r2 == 42) ? 1 : 0;
			
 
				 }
			
--- a/Tests/Cuda/Complex/mixed.cu
+++ b/Tests/Cuda/Complex/mixed.cu
@@ -15,7 +15,7 @@
 
				 result_type __device__ file1_func(int x);
			
 
				 result_type_dynamic __device__ file2_func(int x);
			
 
				 
			
 
				-IMPORT void __host__ cuda_dynamic_lib_func();
			
 
				+IMPORT bool __host__ cuda_dynamic_lib_func();
			
 
				 
			
 
				 static __global__ void mixed_kernel(result_type* r, int x)
			
 
				 {
			
@@ -25,7 +25,9 @@ static __global__ void mixed_kernel(result_type* r, int x)
 
				 
			
 
				 EXPORT int mixed_launch_kernel(int x)
			
 
				 {
			
 
				-  cuda_dynamic_lib_func();
			
 
				+  if (!cuda_dynamic_lib_func()) {
			
 
				+    return x;
			
 
				+  }
			
 
				 
			
 
				   result_type* r;
			
 
				   cudaError_t err = cudaMallocManaged(&r, sizeof(result_type));
			
--- a/Tests/CudaOnly/CMakeLists.txt
+++ b/Tests/CudaOnly/CMakeLists.txt
@@ -5,10 +5,21 @@ ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX)
 
				 ADD_TEST_MACRO(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
			
 
				 ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
			
 
				 ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
			
 
				+ADD_TEST_MACRO(CudaOnly.SharedRuntimePlusToolkit CudaOnlySharedRuntimePlusToolkit)
			
 
				 ADD_TEST_MACRO(CudaOnly.Standard98 CudaOnlyStandard98)
			
 
				 ADD_TEST_MACRO(CudaOnly.Toolkit CudaOnlyToolkit)
			
 
				 ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs)
			
 
				 
			
 
				+# The CUDA only ships the shared version of the toolkit libraries
			
 
				+# on windows
			
 
				+if(NOT WIN32)
			
 
				+  ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
			
 
				+endif()
			
 
				+
			
 
				+if(MSVC)
			
 
				+  ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
			
 
				+endif()
			
 
				+
			
 
				 add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
			
 
				   ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
			
 
				   --build-and-test
			
@@ -20,6 +31,14 @@ add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
 
				   --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
			
 
				   )
			
 
				 
			
 
				-if(MSVC)
			
 
				-  ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
			
 
				-endif()
			
 
				+add_test(NAME CudaOnly.RuntimeControls COMMAND
			
 
				+  ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
			
 
				+  --build-and-test
			
 
				+  "${CMAKE_CURRENT_SOURCE_DIR}/RuntimeControls/"
			
 
				+  "${CMAKE_CURRENT_BINARY_DIR}/RuntimeControls/"
			
 
				+  --build-two-config
			
 
				+    ${build_generator_args}
			
 
				+  --build-project RuntimeControls
			
 
				+  --build-options ${build_options}
			
 
				+  --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
			
 
				+  )
			
--- a/Tests/CudaOnly/RuntimeControls/CMakeLists.txt
+++ b/Tests/CudaOnly/RuntimeControls/CMakeLists.txt
@@ -0,0 +1,60 @@
 
				+cmake_minimum_required(VERSION 3.7)
			
 
				+project (RuntimeControls CUDA)
			
 
				+
			
 
				+# Find nm and dumpbin
			
 
				+if(CMAKE_NM)
			
 
				+  set(dump_command ${CMAKE_NM})
			
 
				+  set(dump_args -g)
			
 
				+else()
			
 
				+  include(GetPrerequisites)
			
 
				+  message(STATUS "calling list_prerequisites to find dumpbin")
			
 
				+  list_prerequisites("${CMAKE_COMMAND}" 0 0 0)
			
 
				+  if(gp_dumpbin)
			
 
				+    set(dump_command ${gp_dumpbin})
			
 
				+    set(dump_args /ARCHIVEMEMBERS)
			
 
				+  endif()
			
 
				+endif()
			
 
				+
			
 
				+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=[compute_30]")
			
 
				+
			
 
				+set(CMAKE_CUDA_STANDARD 11)
			
 
				+set(CMAKE_CUDA_RUNTIME_LIBRARY static)
			
 
				+
			
 
				+if(NOT "x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
			
 
				+  add_library(UsesNoCudaRT SHARED file1.cu)
			
 
				+  set_target_properties(UsesNoCudaRT PROPERTIES CUDA_RUNTIME_LIBRARY none)
			
 
				+endif()
			
 
				+
			
 
				+add_library(UsesStaticCudaRT SHARED file2.cu)
			
 
				+
			
 
				+add_executable(CudaOnlyRuntimeControls main.cu)
			
 
				+set_target_properties(CudaOnlyRuntimeControls PROPERTIES CUDA_RUNTIME_LIBRARY shared)
			
 
				+
			
 
				+target_link_libraries(CudaOnlyRuntimeControls PRIVATE $<TARGET_NAME_IF_EXISTS:UsesNoCudaRT> UsesStaticCudaRT)
			
 
				+
			
 
				+
			
 
				+if(dump_command)
			
 
				+  if(TARGET UsesNoCudaRT)
			
 
				+    add_custom_command(TARGET UsesNoCudaRT POST_BUILD
			
 
				+    COMMAND ${CMAKE_COMMAND}
			
 
				+    -DDUMP_COMMAND=${dump_command}
			
 
				+    -DDUMP_ARGS=${dump_args}
			
 
				+    -DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesNoCudaRT>
			
 
				+    -P ${CMAKE_CURRENT_SOURCE_DIR}/no_runtime.cmake
			
 
				+    )
			
 
				+  endif()
			
 
				+  add_custom_command(TARGET UsesStaticCudaRT POST_BUILD
			
 
				+    COMMAND ${CMAKE_COMMAND}
			
 
				+    -DDUMP_COMMAND=${dump_command}
			
 
				+    -DDUMP_ARGS=${dump_args}
			
 
				+    -DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesStaticCudaRT>
			
 
				+    -P ${CMAKE_CURRENT_SOURCE_DIR}/uses_static_runtime.cmake
			
 
				+  )
			
 
				+  string(REPLACE ";" "|" dirs "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
			
 
				+  add_custom_command(TARGET CudaOnlyRuntimeControls POST_BUILD
			
 
				+    COMMAND ${CMAKE_COMMAND}
			
 
				+    -DEXEC_PATH=$<TARGET_FILE:CudaOnlyRuntimeControls>
			
 
				+    -DEXTRA_LIB_DIRS="${dirs}"
			
 
				+    -P ${CMAKE_CURRENT_SOURCE_DIR}/verify_runtime.cmake
			
 
				+  )
			
 
				+endif()
			
--- a/Tests/CudaOnly/RuntimeControls/file1.cu
+++ b/Tests/CudaOnly/RuntimeControls/file1.cu
@@ -0,0 +1,18 @@
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define EXPORT __declspec(dllexport)
			
 
				+#else
			
 
				+#  define EXPORT
			
 
				+#endif
			
 
				+
			
 
				+void __global__ file1_kernel(int x, int& r)
			
 
				+{
			
 
				+  r = -x;
			
 
				+}
			
 
				+
			
 
				+EXPORT int file1_launch_kernel(int x)
			
 
				+{
			
 
				+  int r = 0;
			
 
				+  file1_kernel<<<1, 1>>>(x, r);
			
 
				+  return r;
			
 
				+}
			
--- a/Tests/CudaOnly/RuntimeControls/file2.cu
+++ b/Tests/CudaOnly/RuntimeControls/file2.cu
@@ -0,0 +1,18 @@
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define EXPORT __declspec(dllexport)
			
 
				+#else
			
 
				+#  define EXPORT
			
 
				+#endif
			
 
				+
			
 
				+void __global__ file2_kernel(int x, int& r)
			
 
				+{
			
 
				+  r = -x;
			
 
				+}
			
 
				+
			
 
				+EXPORT int file2_launch_kernel(int x)
			
 
				+{
			
 
				+  int r = 0;
			
 
				+  file2_kernel<<<1, 1>>>(x, r);
			
 
				+  return r;
			
 
				+}
			
--- a/Tests/CudaOnly/RuntimeControls/main.cu
+++ b/Tests/CudaOnly/RuntimeControls/main.cu
@@ -0,0 +1,81 @@
 
				+
			
 
				+#include <iostream>
			
 
				+
			
 
				+#include "cuda.h"
			
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define IMPORT __declspec(dllimport)
			
 
				+#else
			
 
				+#  define IMPORT
			
 
				+#endif
			
 
				+
			
 
				+#ifndef _WIN32
			
 
				+IMPORT int file1_launch_kernel(int x);
			
 
				+#endif
			
 
				+
			
 
				+IMPORT int file2_launch_kernel(int x);
			
 
				+
			
 
				+int choose_cuda_device()
			
 
				+{
			
 
				+  int nDevices = 0;
			
 
				+  cudaError_t err = cudaGetDeviceCount(&nDevices);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
			
 
				+              << std::endl;
			
 
				+    return 1;
			
 
				+  }
			
 
				+  for (int i = 0; i < nDevices; ++i) {
			
 
				+    cudaDeviceProp prop;
			
 
				+    cudaError_t err = cudaGetDeviceProperties(&prop, i);
			
 
				+    if (err != cudaSuccess) {
			
 
				+      std::cerr << "Could not retrieve properties from CUDA device " << i
			
 
				+                << std::endl;
			
 
				+      return 1;
			
 
				+    }
			
 
				+    std::cout << "prop.major: " << prop.major << std::endl;
			
 
				+    if (prop.major >= 3) {
			
 
				+      err = cudaSetDevice(i);
			
 
				+      if (err != cudaSuccess) {
			
 
				+        std::cout << "Could not select CUDA device " << i << std::endl;
			
 
				+      } else {
			
 
				+        return 0;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
			
 
				+            << std::endl;
			
 
				+
			
 
				+  return 1;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char** argv)
			
 
				+{
			
 
				+  int ret = choose_cuda_device();
			
 
				+  if (ret) {
			
 
				+    return 0;
			
 
				+  }
			
 
				+
			
 
				+  cudaError_t err;
			
 
				+#ifndef _WIN32
			
 
				+  file1_launch_kernel(1);
			
 
				+  err = cudaGetLastError();
			
 
				+  if (err != cudaSuccess) {
			
 
				+    std::cerr << "file1_launch_kernel: kernel launch should have passed.\n "
			
 
				+                 "Error message: "
			
 
				+              << cudaGetErrorString(err) << std::endl;
			
 
				+    return 1;
			
 
				+  }
			
 
				+#endif
			
 
				+
			
 
				+  file2_launch_kernel(1);
			
 
				+  err = cudaGetLastError();
			
 
				+  if (err != cudaSuccess) {
			
 
				+    std::cerr << "file2_launch_kernel: kernel launch should have passed.\n "
			
 
				+                 "Error message: "
			
 
				+              << cudaGetErrorString(err) << std::endl;
			
 
				+    return 1;
			
 
				+  }
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
--- a/Tests/CudaOnly/RuntimeControls/no_runtime.cmake
+++ b/Tests/CudaOnly/RuntimeControls/no_runtime.cmake
@@ -0,0 +1,14 @@
 
				+execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
			
 
				+  RESULT_VARIABLE RESULT
			
 
				+  OUTPUT_VARIABLE OUTPUT
			
 
				+  ERROR_VARIABLE ERROR
			
 
				+)
			
 
				+
			
 
				+if(NOT "${RESULT}" STREQUAL "0")
			
 
				+  message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
			
 
				+endif()
			
 
				+
			
 
				+if(NOT "${OUTPUT}" MATCHES "(__cuda)")
			
 
				+  message(FATAL_ERROR
			
 
				+  "not missing cuda device symbols, static runtime linking was used.")
			
 
				+endif()
			
--- a/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake
+++ b/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake
@@ -0,0 +1,14 @@
 
				+execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
			
 
				+  RESULT_VARIABLE RESULT
			
 
				+  OUTPUT_VARIABLE OUTPUT
			
 
				+  ERROR_VARIABLE ERROR
			
 
				+)
			
 
				+
			
 
				+if(NOT "${RESULT}" STREQUAL "0")
			
 
				+  message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
			
 
				+endif()
			
 
				+
			
 
				+if("${OUTPUT}" MATCHES "__cuda")
			
 
				+  message(FATAL_ERROR
			
 
				+    "missing cuda device symbols, static runtime linking was not used.")
			
 
				+endif()
			
--- a/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake
+++ b/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake
@@ -0,0 +1,16 @@
 
				+
			
 
				+string(REPLACE "|" ";" dirs "${EXTRA_LIB_DIRS}")
			
 
				+file(GET_RUNTIME_DEPENDENCIES
			
 
				+  RESOLVED_DEPENDENCIES_VAR resolved_libs
			
 
				+  UNRESOLVED_DEPENDENCIES_VAR unresolved_libs
			
 
				+  DIRECTORIES ${dirs}
			
 
				+  EXECUTABLES ${EXEC_PATH}
			
 
				+  )
			
 
				+
			
 
				+list(FILTER resolved_libs INCLUDE REGEX ".*cudart.*")
			
 
				+list(LENGTH resolved_libs has_cudart)
			
 
				+
			
 
				+if(has_cudart EQUAL 0)
			
 
				+  message(FATAL_ERROR
			
 
				+    "missing cudart shared library from runtime dependency output.")
			
 
				+endif()
			
--- a/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
@@ -0,0 +1,42 @@
 
				+cmake_minimum_required(VERSION 3.15)
			
 
				+project(SharedRuntimePlusToolkit CUDA)
			
 
				+
			
 
				+#Goal for this example:
			
 
				+# Validate that with c++ we can use some components of the CUDA toolkit, and
			
 
				+# specify the cuda runtime
			
 
				+find_package(CUDAToolkit REQUIRED)
			
 
				+
			
 
				+add_library(Common OBJECT curand.cu nppif.cu)
			
 
				+target_link_libraries(Common PRIVATE CUDA::toolkit)
			
 
				+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
			
 
				+
			
 
				+#shared runtime with shared toolkit libraries
			
 
				+add_library(SharedToolkit SHARED shared.cu)
			
 
				+target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
			
 
				+set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
			
 
				+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
			
 
				+
			
 
				+# The CUDA only ships the shared version of the toolkit libraries
			
 
				+# on windows
			
 
				+if(NOT WIN32)
			
 
				+  #shared runtime with static toolkit libraries
			
 
				+  add_library(StaticToolkit SHARED static.cu)
			
 
				+  target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
			
 
				+  set_target_properties(StaticToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
			
 
				+
			
 
				+  #static runtime with mixed toolkit libraries
			
 
				+  add_library(MixedToolkit SHARED mixed.cu)
			
 
				+  target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
			
 
				+  set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
			
 
				+endif()
			
 
				+
			
 
				+add_executable(CudaOnlySharedRuntimePlusToolkit main.cu)
			
 
				+target_link_libraries(CudaOnlySharedRuntimePlusToolkit PRIVATE SharedToolkit
			
 
				+                      $<TARGET_NAME_IF_EXISTS:StaticToolkit>
			
 
				+                      $<TARGET_NAME_IF_EXISTS:MixedToolkit>)
			
 
				+
			
 
				+if(UNIX)
			
 
				+  # Help the shared cuda runtime find libcudart as it is not located
			
 
				+  # in a default system searched location
			
 
				+  set_property(TARGET CudaOnlySharedRuntimePlusToolkit PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
			
 
				+endif()
			
--- a/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
@@ -0,0 +1,65 @@
 
				+// Comes from:
			
 
				+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
			
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define EXPORT __declspec(dllexport)
			
 
				+#else
			
 
				+#  define EXPORT
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				+ * This program uses the host CURAND API to generate 100
			
 
				+ * pseudorandom floats.
			
 
				+ */
			
 
				+#include <cuda.h>
			
 
				+#include <curand.h>
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+#define CUDA_CALL(x)                                                          \
			
 
				+  do {                                                                        \
			
 
				+    if ((x) != cudaSuccess) {                                                 \
			
 
				+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
			
 
				+      return EXIT_FAILURE;                                                    \
			
 
				+    }                                                                         \
			
 
				+  } while (0)
			
 
				+#define CURAND_CALL(x)                                                        \
			
 
				+  do {                                                                        \
			
 
				+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
			
 
				+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
			
 
				+      return EXIT_FAILURE;                                                    \
			
 
				+    }                                                                         \
			
 
				+  } while (0)
			
 
				+
			
 
				+EXPORT int curand_main()
			
 
				+{
			
 
				+  size_t n = 100;
			
 
				+  size_t i;
			
 
				+  curandGenerator_t gen;
			
 
				+  float *devData, *hostData;
			
 
				+
			
 
				+  /* Allocate n floats on host */
			
 
				+  hostData = (float*)calloc(n, sizeof(float));
			
 
				+
			
 
				+  /* Allocate n floats on device */
			
 
				+  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
			
 
				+
			
 
				+  /* Create pseudo-random number generator */
			
 
				+  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
			
 
				+
			
 
				+  /* Set seed */
			
 
				+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
			
 
				+
			
 
				+  /* Generate n floats on device */
			
 
				+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
			
 
				+
			
 
				+  /* Copy device memory to host */
			
 
				+  CUDA_CALL(
			
 
				+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
			
 
				+
			
 
				+  /* Cleanup */
			
 
				+  CURAND_CALL(curandDestroyGenerator(gen));
			
 
				+  CUDA_CALL(cudaFree(devData));
			
 
				+  free(hostData);
			
 
				+  return EXIT_SUCCESS;
			
 
				+}
			
--- a/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
@@ -0,0 +1,23 @@
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define IMPORT __declspec(dllimport)
			
 
				+IMPORT int shared_version();
			
 
				+int static_version()
			
 
				+{
			
 
				+  return 0;
			
 
				+}
			
 
				+int mixed_version()
			
 
				+{
			
 
				+  return 0;
			
 
				+}
			
 
				+#else
			
 
				+int shared_version();
			
 
				+int static_version();
			
 
				+int mixed_version();
			
 
				+#endif
			
 
				+
			
 
				+int main()
			
 
				+{
			
 
				+  return mixed_version() == 0 && shared_version() == 0 &&
			
 
				+    static_version() == 0;
			
 
				+}
			
--- a/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
@@ -0,0 +1,16 @@
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define IMPORT __declspec(dllimport)
			
 
				+#  define EXPORT __declspec(dllexport)
			
 
				+#else
			
 
				+#  define IMPORT
			
 
				+#  define EXPORT
			
 
				+#endif
			
 
				+
			
 
				+IMPORT int curand_main();
			
 
				+IMPORT int nppif_main();
			
 
				+
			
 
				+EXPORT int mixed_version()
			
 
				+{
			
 
				+  return curand_main() == 0 && nppif_main() == 0;
			
 
				+}
			
--- a/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
@@ -0,0 +1,92 @@
 
				+// Comes from
			
 
				+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
			
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define EXPORT __declspec(dllexport)
			
 
				+#else
			
 
				+#  define EXPORT
			
 
				+#endif
			
 
				+
			
 
				+#include <cstdio>
			
 
				+#include <iostream>
			
 
				+
			
 
				+#include <assert.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <nppi_filtering_functions.h>
			
 
				+
			
 
				+EXPORT int nppif_main()
			
 
				+{
			
 
				+  /**
			
 
				+   * 8-bit unsigned single-channel 1D row convolution.
			
 
				+   */
			
 
				+  const int simgrows = 32;
			
 
				+  const int simgcols = 32;
			
 
				+  Npp8u *d_pSrc, *d_pDst;
			
 
				+  const int nMaskSize = 3;
			
 
				+  NppiSize oROI;
			
 
				+  oROI.width = simgcols - nMaskSize;
			
 
				+  oROI.height = simgrows;
			
 
				+  const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
			
 
				+  const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
			
 
				+  const int simgpix = simgrows * simgcols;
			
 
				+  const int dimgpix = oROI.width * oROI.height;
			
 
				+  const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
			
 
				+  const int nDstStep = oROI.width * sizeof(d_pDst[0]);
			
 
				+  const int pixval = 1;
			
 
				+  const int nDivisor = 1;
			
 
				+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
			
 
				+  Npp32s* d_pKernel;
			
 
				+  const Npp32s nAnchor = 2;
			
 
				+  cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMalloc((void**)&d_pDst, dimgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  // set image to pixval initially
			
 
				+  err = cudaMemset(d_pSrc, pixval, simgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMemset(d_pDst, 0, dimgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
			
 
				+                   cudaMemcpyHostToDevice);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  // copy src to dst
			
 
				+  NppStatus ret =
			
 
				+    nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
			
 
				+                         nMaskSize, nAnchor, nDivisor);
			
 
				+  assert(ret == NPP_NO_ERROR);
			
 
				+  Npp8u* h_imgres = new Npp8u[dimgpix];
			
 
				+  err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  // test for filtering
			
 
				+  for (int i = 0; i < dimgpix; i++) {
			
 
				+    if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
			
 
				+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
			
 
				+      return 1;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
--- a/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
@@ -0,0 +1,16 @@
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define IMPORT __declspec(dllimport)
			
 
				+#  define EXPORT __declspec(dllexport)
			
 
				+#else
			
 
				+#  define IMPORT
			
 
				+#  define EXPORT
			
 
				+#endif
			
 
				+
			
 
				+int curand_main();
			
 
				+int nppif_main();
			
 
				+
			
 
				+EXPORT int shared_version()
			
 
				+{
			
 
				+  return curand_main() == 0 && nppif_main() == 0;
			
 
				+}
			
--- a/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
@@ -0,0 +1,16 @@
 
				+
			
 
				+#ifdef _WIN32
			
 
				+#  define IMPORT __declspec(dllimport)
			
 
				+#  define EXPORT __declspec(dllexport)
			
 
				+#else
			
 
				+#  define IMPORT
			
 
				+#  define EXPORT
			
 
				+#endif
			
 
				+
			
 
				+IMPORT int curand_main();
			
 
				+IMPORT int nppif_main();
			
 
				+
			
 
				+EXPORT int static_version()
			
 
				+{
			
 
				+  return curand_main() == 0 && nppif_main() == 0;
			
 
				+}
			
--- a/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
@@ -0,0 +1,29 @@
 
				+cmake_minimum_required(VERSION 3.15)
			
 
				+project(StaticRuntimePlusToolkit CUDA)
			
 
				+
			
 
				+#Goal for this example:
			
 
				+# Validate that with cuda we can use some components of the CUDA toolkit, and
			
 
				+# specify the cuda runtime
			
 
				+find_package(CUDAToolkit REQUIRED)
			
 
				+
			
 
				+add_library(Common OBJECT curand.cu nppif.cu)
			
 
				+target_link_libraries(Common PRIVATE CUDA::toolkit)
			
 
				+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
			
 
				+
			
 
				+#static runtime with shared toolkit libraries
			
 
				+add_library(SharedToolkit SHARED shared.cu)
			
 
				+target_link_libraries(SharedToolkit PRIVATE Common CUDA::curand CUDA::nppif )
			
 
				+set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
			
 
				+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
			
 
				+
			
 
				+#static runtime with static toolkit libraries
			
 
				+add_library(StaticToolkit SHARED static.cu)
			
 
				+target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
			
 
				+
			
 
				+#static runtime with mixed toolkit libraries
			
 
				+add_library(MixedToolkit SHARED mixed.cu)
			
 
				+target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
			
 
				+set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Static)
			
 
				+
			
 
				+add_executable(CudaOnlyStaticRuntimePlusToolkit main.cu)
			
 
				+target_link_libraries(CudaOnlyStaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)
			
--- a/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
@@ -0,0 +1,59 @@
 
				+// Comes from:
			
 
				+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
			
 
				+
			
 
				+/*
			
 
				+ * This program uses the host CURAND API to generate 100
			
 
				+ * pseudorandom floats.
			
 
				+ */
			
 
				+#include <cuda.h>
			
 
				+#include <curand.h>
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+#define CUDA_CALL(x)                                                          \
			
 
				+  do {                                                                        \
			
 
				+    if ((x) != cudaSuccess) {                                                 \
			
 
				+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
			
 
				+      return EXIT_FAILURE;                                                    \
			
 
				+    }                                                                         \
			
 
				+  } while (0)
			
 
				+#define CURAND_CALL(x)                                                        \
			
 
				+  do {                                                                        \
			
 
				+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
			
 
				+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
			
 
				+      return EXIT_FAILURE;                                                    \
			
 
				+    }                                                                         \
			
 
				+  } while (0)
			
 
				+
			
 
				+int curand_main()
			
 
				+{
			
 
				+  size_t n = 100;
			
 
				+  size_t i;
			
 
				+  curandGenerator_t gen;
			
 
				+  float *devData, *hostData;
			
 
				+
			
 
				+  /* Allocate n floats on host */
			
 
				+  hostData = (float*)calloc(n, sizeof(float));
			
 
				+
			
 
				+  /* Allocate n floats on device */
			
 
				+  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
			
 
				+
			
 
				+  /* Create pseudo-random number generator */
			
 
				+  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
			
 
				+
			
 
				+  /* Set seed */
			
 
				+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
			
 
				+
			
 
				+  /* Generate n floats on device */
			
 
				+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
			
 
				+
			
 
				+  /* Copy device memory to host */
			
 
				+  CUDA_CALL(
			
 
				+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
			
 
				+
			
 
				+  /* Cleanup */
			
 
				+  CURAND_CALL(curandDestroyGenerator(gen));
			
 
				+  CUDA_CALL(cudaFree(devData));
			
 
				+  free(hostData);
			
 
				+  return EXIT_SUCCESS;
			
 
				+}
			
--- a/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
@@ -0,0 +1,11 @@
 
				+
			
 
				+
			
 
				+int shared_version();
			
 
				+int static_version();
			
 
				+int mixed_version();
			
 
				+
			
 
				+int main()
			
 
				+{
			
 
				+  return mixed_version() == 0 && shared_version() == 0 &&
			
 
				+    static_version() == 0;
			
 
				+}
			
--- a/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
@@ -0,0 +1,8 @@
 
				+
			
 
				+int curand_main();
			
 
				+int nppif_main();
			
 
				+
			
 
				+int mixed_version()
			
 
				+{
			
 
				+  return curand_main() == 0 && nppif_main() == 0;
			
 
				+}
			
--- a/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
@@ -0,0 +1,86 @@
 
				+// Comes from
			
 
				+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
			
 
				+
			
 
				+#include <cstdio>
			
 
				+#include <iostream>
			
 
				+
			
 
				+#include <assert.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <nppi_filtering_functions.h>
			
 
				+
			
 
				+int nppif_main()
			
 
				+{
			
 
				+  /**
			
 
				+   * 8-bit unsigned single-channel 1D row convolution.
			
 
				+   */
			
 
				+  const int simgrows = 32;
			
 
				+  const int simgcols = 32;
			
 
				+  Npp8u *d_pSrc, *d_pDst;
			
 
				+  const int nMaskSize = 3;
			
 
				+  NppiSize oROI;
			
 
				+  oROI.width = simgcols - nMaskSize;
			
 
				+  oROI.height = simgrows;
			
 
				+  const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
			
 
				+  const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
			
 
				+  const int simgpix = simgrows * simgcols;
			
 
				+  const int dimgpix = oROI.width * oROI.height;
			
 
				+  const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
			
 
				+  const int nDstStep = oROI.width * sizeof(d_pDst[0]);
			
 
				+  const int pixval = 1;
			
 
				+  const int nDivisor = 1;
			
 
				+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
			
 
				+  Npp32s* d_pKernel;
			
 
				+  const Npp32s nAnchor = 2;
			
 
				+  cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMalloc((void**)&d_pDst, dimgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  // set image to pixval initially
			
 
				+  err = cudaMemset(d_pSrc, pixval, simgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMemset(d_pDst, 0, dimgsize);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
			
 
				+                   cudaMemcpyHostToDevice);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  // copy src to dst
			
 
				+  NppStatus ret =
			
 
				+    nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
			
 
				+                         nMaskSize, nAnchor, nDivisor);
			
 
				+  assert(ret == NPP_NO_ERROR);
			
 
				+  Npp8u* h_imgres = new Npp8u[dimgpix];
			
 
				+  err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
			
 
				+  if (err != cudaSuccess) {
			
 
				+    fprintf(stderr, "Cuda error %d\n", __LINE__);
			
 
				+    return 1;
			
 
				+  }
			
 
				+  // test for filtering
			
 
				+  for (int i = 0; i < dimgpix; i++) {
			
 
				+    if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
			
 
				+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
			
 
				+      return 1;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
--- a/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
@@ -0,0 +1,8 @@
 
				+
			
 
				+int curand_main();
			
 
				+int nppif_main();
			
 
				+
			
 
				+int shared_version()
			
 
				+{
			
 
				+  return curand_main() == 0 && nppif_main() == 0;
			
 
				+}
			
--- a/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
@@ -0,0 +1,8 @@
 
				+
			
 
				+int curand_main();
			
 
				+int nppif_main();
			
 
				+
			
 
				+int static_version()
			
 
				+{
			
 
				+  return curand_main() == 0 && nppif_main() == 0;
			
 
				+}