Forráskód Böngészése

Merge topic 'add_cuda_toolkit_tests'

6e474364d1 CUDAToolkit: No targets now depend on the CUDA runtime
907bb7df57 CUDAToolkit: Gracefully handle missing SDK components
e500eb80cd CUDAToolkit: add_cuda_link_dependency correctly sets dependencies

Acked-by: Kitware Robot <[email protected]>
Merge-request: !4183
Brad King 5 éve
szülő
commit
9f1ce93d92

+ 8 - 12
Modules/FindCUDAToolkit.cmake

@@ -122,7 +122,6 @@ CUDA Runtime Library
 
 
 The CUDA Runtime library (cudart) is what most applications will typically
 The CUDA Runtime library (cudart) is what most applications will typically
 need to link against to make any calls such as `cudaMalloc`, and `cudaFree`.
 need to link against to make any calls such as `cudaMalloc`, and `cudaFree`.
-They are an explicit dependency of almost every library.
 
 
 Targets Created:
 Targets Created:
 
 
@@ -708,9 +707,13 @@ if(CUDAToolkit_FOUND)
   endfunction()
   endfunction()
 
 
   function(add_cuda_link_dependency lib_name)
   function(add_cuda_link_dependency lib_name)
-    foreach(dependency IN LISTS ${ARGN})
-      target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency})
-    endforeach()
+    if(TARGET CUDA::${lib_name})
+      foreach(dependency IN LISTS ARGN)
+        if(TARGET CUDA::${dependency})
+          target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency})
+        endif()
+      endforeach()
+    endif()
   endfunction()
   endfunction()
 
 
   add_library(CUDA::toolkit IMPORTED INTERFACE)
   add_library(CUDA::toolkit IMPORTED INTERFACE)
@@ -725,10 +728,8 @@ if(CUDAToolkit_FOUND)
 
 
   foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg)
   foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg)
     find_and_add_cuda_import_lib(${cuda_lib})
     find_and_add_cuda_import_lib(${cuda_lib})
-    add_cuda_link_dependency(${cuda_lib} cudart)
 
 
     find_and_add_cuda_import_lib(${cuda_lib}_static)
     find_and_add_cuda_import_lib(${cuda_lib}_static)
-    add_cuda_link_dependency(${cuda_lib}_static cudart_static)
   endforeach()
   endforeach()
 
 
   # cuSOLVER depends on cuBLAS, and cuSPARSE
   # cuSOLVER depends on cuBLAS, and cuSPARSE
@@ -742,9 +743,6 @@ if(CUDAToolkit_FOUND)
   find_and_add_cuda_import_lib(nppc)
   find_and_add_cuda_import_lib(nppc)
   find_and_add_cuda_import_lib(nppc_static)
   find_and_add_cuda_import_lib(nppc_static)
 
 
-  add_cuda_link_dependency(nppc cudart)
-  add_cuda_link_dependency(nppc_static cudart_static culibos)
-
   # Process the majority of the NPP libraries.
   # Process the majority of the NPP libraries.
   foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
   foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
     find_and_add_cuda_import_lib(${cuda_lib})
     find_and_add_cuda_import_lib(${cuda_lib})
@@ -771,13 +769,11 @@ if(CUDAToolkit_FOUND)
   endif()
   endif()
   find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64)
   find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64)
 
 
-  add_cuda_link_dependency(nvToolsExt cudart)
-
   find_and_add_cuda_import_lib(OpenCL)
   find_and_add_cuda_import_lib(OpenCL)
 
 
   find_and_add_cuda_import_lib(culibos)
   find_and_add_cuda_import_lib(culibos)
   if(TARGET CUDA::culibos)
   if(TARGET CUDA::culibos)
-    foreach (cuda_lib cublas cufft cusparse curand nvjpeg)
+    foreach (cuda_lib cublas cufft cusparse curand nppc nvjpeg)
       add_cuda_link_dependency(${cuda_lib}_static culibos)
       add_cuda_link_dependency(${cuda_lib}_static culibos)
     endforeach()
     endforeach()
   endif()
   endif()

+ 8 - 0
Tests/Cuda/CMakeLists.txt

@@ -14,4 +14,12 @@ ADD_TEST_MACRO(Cuda.Toolkit Toolkit)
 ADD_TEST_MACRO(Cuda.IncludePathNoToolkit IncludePathNoToolkit)
 ADD_TEST_MACRO(Cuda.IncludePathNoToolkit IncludePathNoToolkit)
 ADD_TEST_MACRO(Cuda.ProperDeviceLibraries ProperDeviceLibraries)
 ADD_TEST_MACRO(Cuda.ProperDeviceLibraries ProperDeviceLibraries)
 ADD_TEST_MACRO(Cuda.ProperLinkFlags ProperLinkFlags)
 ADD_TEST_MACRO(Cuda.ProperLinkFlags ProperLinkFlags)
+ADD_TEST_MACRO(Cuda.SharedRuntimePlusToolkit SharedRuntimePlusToolkit)
+
+# CUDA only ships the shared version of the toolkit libraries
+# on Windows
+if(NOT WIN32)
+  ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
+endif()
+
 ADD_TEST_MACRO(Cuda.WithC CudaWithC)
 ADD_TEST_MACRO(Cuda.WithC CudaWithC)

+ 35 - 0
Tests/Cuda/SharedRuntimePlusToolkit/CMakeLists.txt

@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.15)
+project(SharedRuntimePlusToolkit CXX)
+
+# Goal for this example:
+# Validate that plain C++ targets can use CUDA toolkit component libraries
+# (shared, static and mixed) while linking the shared CUDA runtime.
+find_package(CUDAToolkit REQUIRED)
+
+add_library(Common OBJECT curand.cpp nppif.cpp)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# Shared runtime with shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cpp)
+target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
+
+# CUDA only ships the shared version of the toolkit libraries on Windows,
+# so the static and mixed variants are built only on other platforms
+if(NOT WIN32)
+  # Shared runtime with static toolkit libraries
+  add_library(StaticToolkit SHARED static.cpp)
+  target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+  target_link_libraries(StaticToolkit PUBLIC CUDA::cudart)
+
+  # Shared runtime with mixed (static curand, shared nppif) toolkit libraries
+  add_library(MixedToolkit SHARED mixed.cpp)
+  target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
+  target_link_libraries(MixedToolkit PUBLIC CUDA::cudart)
+endif()
+
+add_executable(SharedRuntimePlusToolkit main.cpp)
+target_link_libraries(SharedRuntimePlusToolkit PRIVATE SharedToolkit
+                      $<TARGET_NAME_IF_EXISTS:StaticToolkit>
+                      $<TARGET_NAME_IF_EXISTS:MixedToolkit>)

+ 65 - 0
Tests/Cuda/SharedRuntimePlusToolkit/curand.cpp

@@ -0,0 +1,65 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CUDA_CALL(x)                                                          \
+  do {                                                                        \
+    if ((x) != cudaSuccess) {                                                 \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+#define CURAND_CALL(x)                                                        \
+  do {                                                                        \
+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+
+// Runs the seeded generate-and-copy sequence on resources owned by the
+// caller. Returns EXIT_SUCCESS, or EXIT_FAILURE as soon as a cuRAND or CUDA
+// runtime call reports an error (the macros print the failing file and line).
+static int curand_generate(curandGenerator_t gen, float* devData,
+                           float* hostData, size_t n)
+{
+  /* Set seed */
+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+
+  /* Generate n floats on device */
+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
+
+  /* Copy device memory to host */
+  CUDA_CALL(
+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+  return EXIT_SUCCESS;
+}
+
+// Uses the host cuRAND API to generate 100 pseudorandom floats on the device
+// and copy them back to the host.
+// Returns EXIT_SUCCESS when every call succeeds and EXIT_FAILURE otherwise.
+// Every exit path releases whatever was acquired before it (host buffer,
+// device buffer, generator).
+EXPORT int curand_main()
+{
+  const size_t n = 100;
+  float* devData = NULL;
+  curandGenerator_t gen;
+
+  /* Allocate n floats on host */
+  float* hostData = (float*)calloc(n, sizeof(float));
+  if (hostData == NULL) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    return EXIT_FAILURE;
+  }
+
+  /* Allocate n floats on device */
+  if (cudaMalloc((void**)&devData, n * sizeof(float)) != cudaSuccess) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    free(hostData);
+    return EXIT_FAILURE;
+  }
+
+  /* Create pseudo-random number generator */
+  if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) !=
+      CURAND_STATUS_SUCCESS) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    cudaFree(devData);
+    free(hostData);
+    return EXIT_FAILURE;
+  }
+
+  int result = curand_generate(gen, devData, hostData, n);
+
+  /* Cleanup is always attempted; a cleanup failure also fails the run */
+  if (curandDestroyGenerator(gen) != CURAND_STATUS_SUCCESS) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    result = EXIT_FAILURE;
+  }
+  if (cudaFree(devData) != cudaSuccess) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    result = EXIT_FAILURE;
+  }
+  free(hostData);
+  return result;
+}

+ 23 - 0
Tests/Cuda/SharedRuntimePlusToolkit/main.cpp

@@ -0,0 +1,23 @@
+
+// Each *_version() entry point returns nonzero when both of its toolkit
+// exercises succeeded and zero otherwise.
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+IMPORT int shared_version();
+// The static and mixed toolkit variants are not built on Windows, so these
+// stand-ins report success to keep them from failing the test.
+int static_version()
+{
+  return 1;
+}
+int mixed_version()
+{
+  return 1;
+}
+#else
+int shared_version();
+int static_version();
+int mixed_version();
+#endif
+
+// Runs all three variants and exits with 0 only when every one of them
+// reported success. All three are always called, so a failure in any single
+// variant makes the test fail instead of being masked by another success.
+int main()
+{
+  const int mixed_ok = mixed_version();
+  const int shared_ok = shared_version();
+  const int static_ok = static_version();
+  return (mixed_ok != 0 && shared_ok != 0 && static_ok != 0) ? 0 : 1;
+}

+ 16 - 0
Tests/Cuda/SharedRuntimePlusToolkit/mixed.cpp

@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+// Returns 1 when both curand_main() and nppif_main() return 0, else 0.
+// nppif_main() is not called once curand_main() has reported a failure.
+EXPORT int mixed_version()
+{
+  if (curand_main() != 0) {
+    return 0;
+  }
+  return nppif_main() == 0 ? 1 : 0;
+}

+ 92 - 0
Tests/Cuda/SharedRuntimePlusToolkit/nppif.cpp

@@ -0,0 +1,92 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+namespace {
+// Owns every buffer nppif_main() allocates so that each return path releases
+// them. cudaFree() and delete[] both accept a null pointer.
+struct NppifBuffers
+{
+  Npp8u* d_pSrc;
+  Npp8u* d_pDst;
+  Npp32s* d_pKernel;
+  Npp8u* h_imgres;
+
+  NppifBuffers()
+    : d_pSrc(0)
+    , d_pDst(0)
+    , d_pKernel(0)
+    , h_imgres(0)
+  {
+  }
+
+  ~NppifBuffers()
+  {
+    cudaFree(d_pSrc);
+    cudaFree(d_pDst);
+    cudaFree(d_pKernel);
+    delete[] h_imgres;
+  }
+};
+}
+
+/**
+ * 8-bit unsigned single-channel 1D row convolution.
+ *
+ * Fills a 32x32 source image with `pixval`, filters it with a 3-tap kernel
+ * of `pixval` values and checks that every output pixel equals
+ * pixval * pixval * nMaskSize.
+ *
+ * Returns 0 on success and 1 when a CUDA runtime call, the NPP filter call
+ * or the result comparison fails. The NPP status is checked at run time
+ * rather than with assert(), so the check also applies when NDEBUG is set.
+ */
+EXPORT int nppif_main()
+{
+  const int simgrows = 32;
+  const int simgcols = 32;
+  const int nMaskSize = 3;
+  NppiSize oROI;
+  oROI.width = simgcols - nMaskSize;
+  oROI.height = simgrows;
+  NppifBuffers buf;
+  const int simgsize = simgrows * simgcols * sizeof(buf.d_pSrc[0]);
+  const int dimgsize = oROI.width * oROI.height * sizeof(buf.d_pSrc[0]);
+  const int dimgpix = oROI.width * oROI.height;
+  const int nSrcStep = simgcols * sizeof(buf.d_pSrc[0]);
+  const int nDstStep = oROI.width * sizeof(buf.d_pDst[0]);
+  const int pixval = 1;
+  const int nDivisor = 1;
+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+  const Npp32s nAnchor = 2;
+  cudaError_t err = cudaMalloc((void**)&buf.d_pSrc, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&buf.d_pDst, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&buf.d_pKernel,
+                   nMaskSize * sizeof(buf.d_pKernel[0]));
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // set image to pixval initially
+  err = cudaMemset(buf.d_pSrc, pixval, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemset(buf.d_pDst, 0, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemcpy(buf.d_pKernel, h_pKernel,
+                   nMaskSize * sizeof(buf.d_pKernel[0]),
+                   cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // run the row filter from src into dst
+  NppStatus ret =
+    nppiFilterRow_8u_C1R(buf.d_pSrc, nSrcStep, buf.d_pDst, nDstStep, oROI,
+                         buf.d_pKernel, nMaskSize, nAnchor, nDivisor);
+  if (ret != NPP_NO_ERROR) {
+    fprintf(stderr, "NPP error %d at line %d\n", (int)ret, __LINE__);
+    return 1;
+  }
+  buf.h_imgres = new Npp8u[dimgpix];
+  err = cudaMemcpy(buf.h_imgres, buf.d_pDst, dimgsize, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // test for filtering
+  for (int i = 0; i < dimgpix; i++) {
+    if (buf.h_imgres[i] != (pixval * pixval * nMaskSize)) {
+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+      return 1;
+    }
+  }
+
+  return 0;
+}

+ 16 - 0
Tests/Cuda/SharedRuntimePlusToolkit/shared.cpp

@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+int curand_main();
+int nppif_main();
+
+// Returns 1 when both curand_main() and nppif_main() return 0, else 0.
+// nppif_main() is not called once curand_main() has reported a failure.
+EXPORT int shared_version()
+{
+  if (curand_main() != 0) {
+    return 0;
+  }
+  return nppif_main() == 0 ? 1 : 0;
+}

+ 16 - 0
Tests/Cuda/SharedRuntimePlusToolkit/static.cpp

@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+// Returns 1 when both curand_main() and nppif_main() return 0, else 0.
+// nppif_main() is not called once curand_main() has reported a failure.
+EXPORT int static_version()
+{
+  if (curand_main() != 0) {
+    return 0;
+  }
+  return nppif_main() == 0 ? 1 : 0;
+}

+ 29 - 0
Tests/Cuda/StaticRuntimePlusToolkit/CMakeLists.txt

@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.15)
+project(StaticRuntimePlusToolkit CXX)
+
+# Goal for this example:
+# Validate that plain C++ targets can use CUDA toolkit component libraries
+# (shared, static and mixed) while linking the static CUDA runtime.
+find_package(CUDAToolkit REQUIRED)
+
+add_library(Common OBJECT curand.cpp nppif.cpp)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# Static runtime with shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cpp)
+target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
+
+# Static runtime with static toolkit libraries
+add_library(StaticToolkit SHARED static.cpp)
+target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+target_link_libraries(StaticToolkit PUBLIC CUDA::cudart_static)
+
+# Static runtime with mixed (shared curand, static nppif) toolkit libraries
+add_library(MixedToolkit SHARED mixed.cpp)
+target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
+target_link_libraries(MixedToolkit PUBLIC CUDA::cudart_static)
+
+add_executable(StaticRuntimePlusToolkit main.cpp)
+target_link_libraries(StaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)

+ 59 - 0
Tests/Cuda/StaticRuntimePlusToolkit/curand.cpp

@@ -0,0 +1,59 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CUDA_CALL(x)                                                          \
+  do {                                                                        \
+    if ((x) != cudaSuccess) {                                                 \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+#define CURAND_CALL(x)                                                        \
+  do {                                                                        \
+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+
+// Runs the seeded generate-and-copy sequence on resources owned by the
+// caller. Returns EXIT_SUCCESS, or EXIT_FAILURE as soon as a cuRAND or CUDA
+// runtime call reports an error (the macros print the failing file and line).
+static int curand_generate(curandGenerator_t gen, float* devData,
+                           float* hostData, size_t n)
+{
+  /* Set seed */
+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+
+  /* Generate n floats on device */
+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
+
+  /* Copy device memory to host */
+  CUDA_CALL(
+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+  return EXIT_SUCCESS;
+}
+
+// Uses the host cuRAND API to generate 100 pseudorandom floats on the device
+// and copy them back to the host.
+// Returns EXIT_SUCCESS when every call succeeds and EXIT_FAILURE otherwise.
+// Every exit path releases whatever was acquired before it (host buffer,
+// device buffer, generator).
+int curand_main()
+{
+  const size_t n = 100;
+  float* devData = NULL;
+  curandGenerator_t gen;
+
+  /* Allocate n floats on host */
+  float* hostData = (float*)calloc(n, sizeof(float));
+  if (hostData == NULL) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    return EXIT_FAILURE;
+  }
+
+  /* Allocate n floats on device */
+  if (cudaMalloc((void**)&devData, n * sizeof(float)) != cudaSuccess) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    free(hostData);
+    return EXIT_FAILURE;
+  }
+
+  /* Create pseudo-random number generator */
+  if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) !=
+      CURAND_STATUS_SUCCESS) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    cudaFree(devData);
+    free(hostData);
+    return EXIT_FAILURE;
+  }
+
+  int result = curand_generate(gen, devData, hostData, n);
+
+  /* Cleanup is always attempted; a cleanup failure also fails the run */
+  if (curandDestroyGenerator(gen) != CURAND_STATUS_SUCCESS) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    result = EXIT_FAILURE;
+  }
+  if (cudaFree(devData) != cudaSuccess) {
+    printf("Error at %s:%d\n", __FILE__, __LINE__);
+    result = EXIT_FAILURE;
+  }
+  free(hostData);
+  return result;
+}

+ 11 - 0
Tests/Cuda/StaticRuntimePlusToolkit/main.cpp

@@ -0,0 +1,11 @@
+
+
+int shared_version();
+int static_version();
+int mixed_version();
+
+// Each *_version() entry point returns nonzero when both of its toolkit
+// exercises succeeded and zero otherwise. All three are always called, and
+// the process exits with 0 only when every one of them reported success, so
+// a failure in any single variant makes the test fail.
+int main()
+{
+  const int mixed_ok = mixed_version();
+  const int shared_ok = shared_version();
+  const int static_ok = static_version();
+  return (mixed_ok != 0 && shared_ok != 0 && static_ok != 0) ? 0 : 1;
+}

+ 8 - 0
Tests/Cuda/StaticRuntimePlusToolkit/mixed.cpp

@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+// Returns 1 when both curand_main() and nppif_main() return 0, else 0.
+// nppif_main() is not called once curand_main() has reported a failure.
+int mixed_version()
+{
+  if (curand_main() != 0) {
+    return 0;
+  }
+  return nppif_main() == 0 ? 1 : 0;
+}

+ 86 - 0
Tests/Cuda/StaticRuntimePlusToolkit/nppif.cpp

@@ -0,0 +1,86 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+namespace {
+// Owns every buffer nppif_main() allocates so that each return path releases
+// them. cudaFree() and delete[] both accept a null pointer.
+struct NppifBuffers
+{
+  Npp8u* d_pSrc;
+  Npp8u* d_pDst;
+  Npp32s* d_pKernel;
+  Npp8u* h_imgres;
+
+  NppifBuffers()
+    : d_pSrc(0)
+    , d_pDst(0)
+    , d_pKernel(0)
+    , h_imgres(0)
+  {
+  }
+
+  ~NppifBuffers()
+  {
+    cudaFree(d_pSrc);
+    cudaFree(d_pDst);
+    cudaFree(d_pKernel);
+    delete[] h_imgres;
+  }
+};
+}
+
+/**
+ * 8-bit unsigned single-channel 1D row convolution.
+ *
+ * Fills a 32x32 source image with `pixval`, filters it with a 3-tap kernel
+ * of `pixval` values and checks that every output pixel equals
+ * pixval * pixval * nMaskSize.
+ *
+ * Returns 0 on success and 1 when a CUDA runtime call, the NPP filter call
+ * or the result comparison fails. The NPP status is checked at run time
+ * rather than with assert(), so the check also applies when NDEBUG is set.
+ */
+int nppif_main()
+{
+  const int simgrows = 32;
+  const int simgcols = 32;
+  const int nMaskSize = 3;
+  NppiSize oROI;
+  oROI.width = simgcols - nMaskSize;
+  oROI.height = simgrows;
+  NppifBuffers buf;
+  const int simgsize = simgrows * simgcols * sizeof(buf.d_pSrc[0]);
+  const int dimgsize = oROI.width * oROI.height * sizeof(buf.d_pSrc[0]);
+  const int dimgpix = oROI.width * oROI.height;
+  const int nSrcStep = simgcols * sizeof(buf.d_pSrc[0]);
+  const int nDstStep = oROI.width * sizeof(buf.d_pDst[0]);
+  const int pixval = 1;
+  const int nDivisor = 1;
+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+  const Npp32s nAnchor = 2;
+  cudaError_t err = cudaMalloc((void**)&buf.d_pSrc, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&buf.d_pDst, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&buf.d_pKernel,
+                   nMaskSize * sizeof(buf.d_pKernel[0]));
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // set image to pixval initially
+  err = cudaMemset(buf.d_pSrc, pixval, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemset(buf.d_pDst, 0, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemcpy(buf.d_pKernel, h_pKernel,
+                   nMaskSize * sizeof(buf.d_pKernel[0]),
+                   cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // run the row filter from src into dst
+  NppStatus ret =
+    nppiFilterRow_8u_C1R(buf.d_pSrc, nSrcStep, buf.d_pDst, nDstStep, oROI,
+                         buf.d_pKernel, nMaskSize, nAnchor, nDivisor);
+  if (ret != NPP_NO_ERROR) {
+    fprintf(stderr, "NPP error %d at line %d\n", (int)ret, __LINE__);
+    return 1;
+  }
+  buf.h_imgres = new Npp8u[dimgpix];
+  err = cudaMemcpy(buf.h_imgres, buf.d_pDst, dimgsize, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // test for filtering
+  for (int i = 0; i < dimgpix; i++) {
+    if (buf.h_imgres[i] != (pixval * pixval * nMaskSize)) {
+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+      return 1;
+    }
+  }
+
+  return 0;
+}

+ 8 - 0
Tests/Cuda/StaticRuntimePlusToolkit/shared.cpp

@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+// Returns 1 when both curand_main() and nppif_main() return 0, else 0.
+// nppif_main() is not called once curand_main() has reported a failure.
+int shared_version()
+{
+  if (curand_main() != 0) {
+    return 0;
+  }
+  return nppif_main() == 0 ? 1 : 0;
+}

+ 8 - 0
Tests/Cuda/StaticRuntimePlusToolkit/static.cpp

@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+// Returns 1 when both curand_main() and nppif_main() return 0, else 0.
+// nppif_main() is not called once curand_main() has reported a failure.
+int static_version()
+{
+  if (curand_main() != 0) {
+    return 0;
+  }
+  return nppif_main() == 0 ? 1 : 0;
+}