|
|
@@ -0,0 +1,92 @@
|
|
|
+// Comes from
|
|
|
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
|
|
|
+
|
|
|
+#ifdef _WIN32
|
|
|
+# define EXPORT __declspec(dllexport)
|
|
|
+#else
|
|
|
+# define EXPORT
|
|
|
+#endif
|
|
|
+
|
|
|
+#include <cstdio>
|
|
|
+#include <iostream>
|
|
|
+
|
|
|
+#include <assert.h>
|
|
|
+#include <cuda_runtime_api.h>
|
|
|
+#include <nppi_filtering_functions.h>
|
|
|
+
|
|
|
+EXPORT int nppif_main()
|
|
|
+{
|
|
|
+ /**
|
|
|
+ * 8-bit unsigned single-channel 1D row convolution.
|
|
|
+ */
|
|
|
+ const int simgrows = 32;
|
|
|
+ const int simgcols = 32;
|
|
|
+ Npp8u *d_pSrc, *d_pDst;
|
|
|
+ const int nMaskSize = 3;
|
|
|
+ NppiSize oROI;
|
|
|
+ oROI.width = simgcols - nMaskSize;
|
|
|
+ oROI.height = simgrows;
|
|
|
+ const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
|
|
|
+ const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
|
|
|
+ const int simgpix = simgrows * simgcols;
|
|
|
+ const int dimgpix = oROI.width * oROI.height;
|
|
|
+ const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
|
|
|
+ const int nDstStep = oROI.width * sizeof(d_pDst[0]);
|
|
|
+ const int pixval = 1;
|
|
|
+ const int nDivisor = 1;
|
|
|
+ const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
|
|
|
+ Npp32s* d_pKernel;
|
|
|
+ const Npp32s nAnchor = 2;
|
|
|
+ cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
|
|
|
+ if (err != cudaSuccess) {
|
|
|
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ err = cudaMalloc((void**)&d_pDst, dimgsize);
|
|
|
+ if (err != cudaSuccess) {
|
|
|
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
|
|
|
+ if (err != cudaSuccess) {
|
|
|
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ // set image to pixval initially
|
|
|
+ err = cudaMemset(d_pSrc, pixval, simgsize);
|
|
|
+ if (err != cudaSuccess) {
|
|
|
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ err = cudaMemset(d_pDst, 0, dimgsize);
|
|
|
+ if (err != cudaSuccess) {
|
|
|
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
|
|
|
+ cudaMemcpyHostToDevice);
|
|
|
+ if (err != cudaSuccess) {
|
|
|
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ // copy src to dst
|
|
|
+ NppStatus ret =
|
|
|
+ nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
|
|
|
+ nMaskSize, nAnchor, nDivisor);
|
|
|
+ assert(ret == NPP_NO_ERROR);
|
|
|
+ Npp8u* h_imgres = new Npp8u[dimgpix];
|
|
|
+ err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
|
|
|
+ if (err != cudaSuccess) {
|
|
|
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ // test for filtering
|
|
|
+ for (int i = 0; i < dimgpix; i++) {
|
|
|
+ if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
|
|
|
+ fprintf(stderr, "h_imgres at index %d failed to match\n", i);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|