
Merge topic 'add_cuda_toolkit_tests'

6e474364d1 CUDAToolkit: No targets now depend on the CUDA runtime
907bb7df57 CUDAToolkit: Gracefully handle missing SDK components
e500eb80cd CUDAToolkit: add_cuda_link_dependency correctly sets dependencies

Acked-by: Kitware Robot <[email protected]>
Merge-request: !4183
Brad King, 5 years ago
Commit 9f1ce93d92
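The practical effect of the first change is that the toolkit component targets no longer pull in the CUDA runtime implicitly; a consuming project now links whichever runtime it wants explicitly. A minimal consumer sketch, with `my_app` as a hypothetical target name:

    find_package(CUDAToolkit REQUIRED)

    add_executable(my_app main.cpp)
    # CUDA::curand no longer drags in cudart, so pick a runtime explicitly:
    target_link_libraries(my_app PRIVATE CUDA::curand CUDA::cudart)
    # or, for the static runtime:
    # target_link_libraries(my_app PRIVATE CUDA::curand_static CUDA::cudart_static)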

+ 8 - 12
Modules/FindCUDAToolkit.cmake

@@ -122,7 +122,6 @@ CUDA Runtime Library
 
 The CUDA Runtime library (cudart) is what most applications will typically
 need to link against to make calls such as `cudaMalloc` and `cudaFree`.
-They are an explicit dependency of almost every library.
 
 Targets Created:
 
@@ -708,9 +707,13 @@ if(CUDAToolkit_FOUND)
   endfunction()
 
   function(add_cuda_link_dependency lib_name)
-    foreach(dependency IN LISTS ${ARGN})
-      target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency})
-    endforeach()
+    if(TARGET CUDA::${lib_name})
+      foreach(dependency IN LISTS ARGN)
+        if(TARGET CUDA::${dependency})
+          target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency})
+        endif()
+      endforeach()
+    endif()
   endfunction()
 
   add_library(CUDA::toolkit IMPORTED INTERFACE)
@@ -725,10 +728,8 @@ if(CUDAToolkit_FOUND)
 
   foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg)
     find_and_add_cuda_import_lib(${cuda_lib})
-    add_cuda_link_dependency(${cuda_lib} cudart)
 
     find_and_add_cuda_import_lib(${cuda_lib}_static)
-    add_cuda_link_dependency(${cuda_lib}_static cudart_static)
   endforeach()
 
   # cuSOLVER depends on cuBLAS, and cuSPARSE
@@ -742,9 +743,6 @@ if(CUDAToolkit_FOUND)
   find_and_add_cuda_import_lib(nppc)
   find_and_add_cuda_import_lib(nppc_static)
 
-  add_cuda_link_dependency(nppc cudart)
-  add_cuda_link_dependency(nppc_static cudart_static culibos)
-
   # Process the majority of the NPP libraries.
   foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
     find_and_add_cuda_import_lib(${cuda_lib})
@@ -771,13 +769,11 @@ if(CUDAToolkit_FOUND)
   endif()
   find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64)
 
-  add_cuda_link_dependency(nvToolsExt cudart)
-
   find_and_add_cuda_import_lib(OpenCL)
 
   find_and_add_cuda_import_lib(culibos)
   if(TARGET CUDA::culibos)
-    foreach (cuda_lib cublas cufft cusparse curand nvjpeg)
+    foreach (cuda_lib cublas cufft cusparse curand nppc nvjpeg)
       add_cuda_link_dependency(${cuda_lib}_static culibos)
     endforeach()
   endif()
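With the guarded helper above, the module only wires up a dependency when both `CUDA::` targets actually exist, so an SDK installation that is missing an optional component no longer breaks configuration. On the consumer side the same idea applies: guard optional components on the target. A small sketch, assuming a hypothetical `my_app` target and `HAVE_NVJPEG` define:

    find_package(CUDAToolkit REQUIRED)

    # nvjpeg may be absent from a given SDK installation; only use it if found.
    if(TARGET CUDA::nvjpeg)
      target_link_libraries(my_app PRIVATE CUDA::nvjpeg)
      target_compile_definitions(my_app PRIVATE HAVE_NVJPEG)
    endif()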

+ 8 - 0
Tests/Cuda/CMakeLists.txt

@@ -14,4 +14,12 @@ ADD_TEST_MACRO(Cuda.Toolkit Toolkit)
 ADD_TEST_MACRO(Cuda.IncludePathNoToolkit IncludePathNoToolkit)
 ADD_TEST_MACRO(Cuda.ProperDeviceLibraries ProperDeviceLibraries)
 ADD_TEST_MACRO(Cuda.ProperLinkFlags ProperLinkFlags)
+ADD_TEST_MACRO(Cuda.SharedRuntimePlusToolkit SharedRuntimePlusToolkit)
+
+# CUDA only ships the shared version of the toolkit libraries
+# on Windows.
+if(NOT WIN32)
+  ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
+endif()
+
 ADD_TEST_MACRO(Cuda.WithC CudaWithC)
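ADD_TEST_MACRO is a helper defined elsewhere in CMake's test suite; roughly, it configures and builds the named subproject and then runs the resulting executable. A hedged sketch of an equivalent standalone registration using ctest's --build-and-test mode (the directory layout here is an assumption):

    add_test(NAME Cuda.SharedRuntimePlusToolkit
      COMMAND ${CMAKE_CTEST_COMMAND}
        --build-and-test
          ${CMAKE_CURRENT_SOURCE_DIR}/SharedRuntimePlusToolkit
          ${CMAKE_CURRENT_BINARY_DIR}/SharedRuntimePlusToolkit
        --build-generator ${CMAKE_GENERATOR}
        --test-command SharedRuntimePlusToolkit)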

+ 35 - 0
Tests/Cuda/SharedRuntimePlusToolkit/CMakeLists.txt

@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.15)
+project(SharedRuntimePlusToolkit CXX)
+
+# Goal for this example:
+# Validate that with C++ we can use some components of the CUDA toolkit, and
+# specify the CUDA runtime.
+find_package(CUDAToolkit REQUIRED)
+
+add_library(Common OBJECT curand.cpp nppif.cpp)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+#shared runtime with shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cpp)
+target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
+
+# CUDA only ships the shared version of the toolkit libraries
+# on Windows.
+if(NOT WIN32)
+  #shared runtime with static toolkit libraries
+  add_library(StaticToolkit SHARED static.cpp)
+  target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+  target_link_libraries(StaticToolkit PUBLIC CUDA::cudart)
+
+  #static runtime with mixed toolkit libraries
+  add_library(MixedToolkit SHARED mixed.cpp)
+  target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
+  target_link_libraries(MixedToolkit PUBLIC CUDA::cudart)
+endif()
+
+add_executable(SharedRuntimePlusToolkit main.cpp)
+target_link_libraries(SharedRuntimePlusToolkit PRIVATE SharedToolkit
+                      $<TARGET_NAME_IF_EXISTS:StaticToolkit>
+                      $<TARGET_NAME_IF_EXISTS:MixedToolkit>)
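On Windows only SharedToolkit is built, so the executable's link line uses `$<TARGET_NAME_IF_EXISTS:...>`, which expands to the target name when the target was defined and to an empty string otherwise. The same pattern works for any conditionally created target; a minimal sketch, with `BUILD_EXTRAS` and `extras` as hypothetical names:

    if(BUILD_EXTRAS)                    # hypothetical option controlling an optional target
      add_library(extras STATIC extras.cpp)
    endif()

    add_executable(app main.cpp)
    # Expands to "extras" when that target exists at generation time, otherwise to nothing,
    # so the same link line works for both configurations.
    target_link_libraries(app PRIVATE $<TARGET_NAME_IF_EXISTS:extras>)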

+ 65 - 0
Tests/Cuda/SharedRuntimePlusToolkit/curand.cpp

@@ -0,0 +1,65 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CUDA_CALL(x)                                                          \
+  do {                                                                        \
+    if ((x) != cudaSuccess) {                                                 \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+#define CURAND_CALL(x)                                                        \
+  do {                                                                        \
+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+
+EXPORT int curand_main()
+{
+  size_t n = 100;
+  size_t i;
+  curandGenerator_t gen;
+  float *devData, *hostData;
+
+  /* Allocate n floats on host */
+  hostData = (float*)calloc(n, sizeof(float));
+
+  /* Allocate n floats on device */
+  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
+
+  /* Create pseudo-random number generator */
+  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+
+  /* Set seed */
+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+
+  /* Generate n floats on device */
+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
+
+  /* Copy device memory to host */
+  CUDA_CALL(
+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+
+  /* Cleanup */
+  CURAND_CALL(curandDestroyGenerator(gen));
+  CUDA_CALL(cudaFree(devData));
+  free(hostData);
+  return EXIT_SUCCESS;
+}

+ 23 - 0
Tests/Cuda/SharedRuntimePlusToolkit/main.cpp

@@ -0,0 +1,23 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+IMPORT int shared_version();
+int static_version()
+{
+  return 0;
+}
+int mixed_version()
+{
+  return 0;
+}
+#else
+int shared_version();
+int static_version();
+int mixed_version();
+#endif
+
+int main()
+{
+  return mixed_version() == 0 && shared_version() == 0 &&
+    static_version() == 0;
+}

+ 16 - 0
Tests/Cuda/SharedRuntimePlusToolkit/mixed.cpp

@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+EXPORT int mixed_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}

+ 92 - 0
Tests/Cuda/SharedRuntimePlusToolkit/nppif.cpp

@@ -0,0 +1,92 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+EXPORT int nppif_main()
+{
+  /**
+   * 8-bit unsigned single-channel 1D row convolution.
+   */
+  const int simgrows = 32;
+  const int simgcols = 32;
+  Npp8u *d_pSrc, *d_pDst;
+  const int nMaskSize = 3;
+  NppiSize oROI;
+  oROI.width = simgcols - nMaskSize;
+  oROI.height = simgrows;
+  const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
+  const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
+  const int simgpix = simgrows * simgcols;
+  const int dimgpix = oROI.width * oROI.height;
+  const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
+  const int nDstStep = oROI.width * sizeof(d_pDst[0]);
+  const int pixval = 1;
+  const int nDivisor = 1;
+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+  Npp32s* d_pKernel;
+  const Npp32s nAnchor = 2;
+  cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pDst, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // set image to pixval initially
+  err = cudaMemset(d_pSrc, pixval, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemset(d_pDst, 0, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
+                   cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // copy src to dst
+  NppStatus ret =
+    nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
+                         nMaskSize, nAnchor, nDivisor);
+  assert(ret == NPP_NO_ERROR);
+  Npp8u* h_imgres = new Npp8u[dimgpix];
+  err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // test for filtering
+  for (int i = 0; i < dimgpix; i++) {
+    if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+      return 1;
+    }
+  }
+
+  return 0;
+}

+ 16 - 0
Tests/Cuda/SharedRuntimePlusToolkit/shared.cpp

@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+int curand_main();
+int nppif_main();
+
+EXPORT int shared_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}

+ 16 - 0
Tests/Cuda/SharedRuntimePlusToolkit/static.cpp

@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+EXPORT int static_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}

+ 29 - 0
Tests/Cuda/StaticRuntimePlusToolkit/CMakeLists.txt

@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.15)
+project(StaticRuntimePlusToolkit CXX)
+
+# Goal for this example:
+# Validate that with C++ we can use some components of the CUDA toolkit, and
+# specify the CUDA runtime.
+find_package(CUDAToolkit REQUIRED)
+
+add_library(Common OBJECT curand.cpp nppif.cpp)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+#static runtime with shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cpp)
+target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
+
+#static runtime with static toolkit libraries
+add_library(StaticToolkit SHARED static.cpp)
+target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+target_link_libraries(StaticToolkit PUBLIC CUDA::cudart_static)
+
+#static runtime with mixed toolkit libraries
+add_library(MixedToolkit SHARED mixed.cpp)
+target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
+target_link_libraries(MixedToolkit PUBLIC CUDA::cudart_static)
+
+add_executable(StaticRuntimePlusToolkit main.cpp)
+target_link_libraries(StaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)
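This variant mirrors the shared-runtime test; the only difference is that every library links `CUDA::cudart_static`, so the CUDA runtime is linked statically into each shared library. Consumers follow the same pattern; a minimal sketch with a hypothetical `my_lib` target:

    find_package(CUDAToolkit REQUIRED)

    add_library(my_lib SHARED my_lib.cpp)
    # CUDA::culibos is carried on CUDA::curand_static's interface after this change,
    # so only the component and the static runtime need to be named here.
    target_link_libraries(my_lib PRIVATE CUDA::curand_static CUDA::cudart_static)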

+ 59 - 0
Tests/Cuda/StaticRuntimePlusToolkit/curand.cpp

@@ -0,0 +1,59 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CUDA_CALL(x)                                                          \
+  do {                                                                        \
+    if ((x) != cudaSuccess) {                                                 \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+#define CURAND_CALL(x)                                                        \
+  do {                                                                        \
+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+
+int curand_main()
+{
+  size_t n = 100;
+  size_t i;
+  curandGenerator_t gen;
+  float *devData, *hostData;
+
+  /* Allocate n floats on host */
+  hostData = (float*)calloc(n, sizeof(float));
+
+  /* Allocate n floats on device */
+  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
+
+  /* Create pseudo-random number generator */
+  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+
+  /* Set seed */
+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+
+  /* Generate n floats on device */
+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
+
+  /* Copy device memory to host */
+  CUDA_CALL(
+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+
+  /* Cleanup */
+  CURAND_CALL(curandDestroyGenerator(gen));
+  CUDA_CALL(cudaFree(devData));
+  free(hostData);
+  return EXIT_SUCCESS;
+}

+ 11 - 0
Tests/Cuda/StaticRuntimePlusToolkit/main.cpp

@@ -0,0 +1,11 @@
+
+
+int shared_version();
+int static_version();
+int mixed_version();
+
+int main()
+{
+  return mixed_version() == 0 && shared_version() == 0 &&
+    static_version() == 0;
+}

+ 8 - 0
Tests/Cuda/StaticRuntimePlusToolkit/mixed.cpp

@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+int mixed_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}

+ 86 - 0
Tests/Cuda/StaticRuntimePlusToolkit/nppif.cpp

@@ -0,0 +1,86 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+int nppif_main()
+{
+  /**
+   * 8-bit unsigned single-channel 1D row convolution.
+   */
+  const int simgrows = 32;
+  const int simgcols = 32;
+  Npp8u *d_pSrc, *d_pDst;
+  const int nMaskSize = 3;
+  NppiSize oROI;
+  oROI.width = simgcols - nMaskSize;
+  oROI.height = simgrows;
+  const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
+  const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
+  const int simgpix = simgrows * simgcols;
+  const int dimgpix = oROI.width * oROI.height;
+  const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
+  const int nDstStep = oROI.width * sizeof(d_pDst[0]);
+  const int pixval = 1;
+  const int nDivisor = 1;
+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+  Npp32s* d_pKernel;
+  const Npp32s nAnchor = 2;
+  cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pDst, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // set image to pixval initially
+  err = cudaMemset(d_pSrc, pixval, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemset(d_pDst, 0, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
+                   cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // copy src to dst
+  NppStatus ret =
+    nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
+                         nMaskSize, nAnchor, nDivisor);
+  assert(ret == NPP_NO_ERROR);
+  Npp8u* h_imgres = new Npp8u[dimgpix];
+  err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // test for filtering
+  for (int i = 0; i < dimgpix; i++) {
+    if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+      return 1;
+    }
+  }
+
+  return 0;
+}

+ 8 - 0
Tests/Cuda/StaticRuntimePlusToolkit/shared.cpp

@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+int shared_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}

+ 8 - 0
Tests/Cuda/StaticRuntimePlusToolkit/static.cpp

@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+int static_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}