nvenc-cuda.c

#include "nvenc-internal.h"
#include "nvenc-helpers.h"

/*
 * NVENC implementation using CUDA context and arrays
 */

/* ------------------------------------------------------------------------- */
/* CUDA Context management */
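
/* Creates the encoder's CUDA context. On Windows the texture path does not
 * use CUDA, so nothing is created there. On other platforms the device
 * matching the current OpenGL context is used for texture encoding, while
 * non-texture encoding (or the hidden "force_cuda_tex" override) uses the
 * GPU index from the encoder settings. */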
bool cuda_ctx_init(struct nvenc_data *enc, obs_data_t *settings, const bool texture)
{
#ifdef _WIN32
    if (texture)
        return true;
#endif
    int count;
    CUdevice device;
    int gpu = (int)obs_data_get_int(settings, "device");
#ifndef _WIN32
    /* CUDA can do fairly efficient cross-GPU OpenGL mappings, allow it as
     * a hidden option for experimentation. */
    bool force_cuda_tex = obs_data_get_bool(settings, "force_cuda_tex");
#endif

    if (gpu == -1)
        gpu = 0;

    CU_FAILED(cu->cuInit(0))
    CU_FAILED(cu->cuDeviceGetCount(&count))
    if (!count) {
        NV_FAIL("No CUDA devices found");
        return false;
    }

#ifdef _WIN32
    CU_FAILED(cu->cuDeviceGet(&device, gpu))
#else
    if (!texture || force_cuda_tex) {
        CU_FAILED(cu->cuDeviceGet(&device, gpu))
    } else {
        unsigned int ctx_count = 0;
        CUdevice devices[2];

        obs_enter_graphics();
        CUresult res = cu->cuGLGetDevices(&ctx_count, devices, 2, CU_GL_DEVICE_LIST_ALL);
        obs_leave_graphics();

        if (res != CUDA_SUCCESS || !ctx_count) {
            /* Probably running on iGPU, should just fall back to
             * non-texture encoder. */
            if (res == CUDA_ERROR_INVALID_GRAPHICS_CONTEXT) {
                info("Not running on NVIDIA GPU, falling back "
                     "to non-texture encoder");
            } else {
                const char *name, *desc;
                if (cuda_get_error_desc(res, &name, &desc)) {
                    error("Failed to get a CUDA device for "
                          "the current OpenGL context: "
                          "%s: %s",
                          name, desc);
                } else {
                    error("Failed to get a CUDA device for "
                          "the current OpenGL context: %d",
                          res);
                }
            }
            return false;
        }

        /* Documentation indicates this should only ever happen with
         * SLI, i.e. never for OBS. */
        if (ctx_count > 1) {
            warn("Got more than one CUDA device for the OpenGL context,"
                 " this is untested.");
        }

        device = devices[0];
        debug("Loading up CUDA on device %u", device);
    }
#endif

    CU_FAILED(cu->cuCtxCreate(&enc->cu_ctx, 0, device))
    CU_FAILED(cu->cuCtxPopCurrent(NULL))

    return true;
}
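
/* Releases the CUDA context created by cuda_ctx_init(), popping it first in
 * case it is still current on this thread. */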
void cuda_ctx_free(struct nvenc_data *enc)
{
    if (enc->cu_ctx) {
        cu->cuCtxPopCurrent(NULL);
        cu->cuCtxDestroy(enc->cu_ctx);
    }
}

/* ------------------------------------------------------------------------- */
/* CUDA Surface management */
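
/* Creates the CUDA array backing a single input surface and registers it
 * with NVENC. All planes of the chosen format share one array, stacked
 * vertically, which is why the allocated height exceeds the frame height. */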
static bool cuda_surface_init(struct nvenc_data *enc, struct nv_cuda_surface *nvsurf)
{
    const bool p010 = obs_encoder_video_tex_active(enc->encoder, VIDEO_FORMAT_P010);
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Width = enc->cx;
    desc.Height = enc->cy;
    desc.Depth = 0;
    desc.Flags = CUDA_ARRAY3D_SURFACE_LDST;
    desc.NumChannels = 1;

    if (!enc->non_texture) {
        desc.Format = p010 ? CU_AD_FORMAT_UNSIGNED_INT16 : CU_AD_FORMAT_UNSIGNED_INT8;
        desc.Height = enc->cy + enc->cy / 2;
    } else {
        switch (enc->surface_format) {
        case NV_ENC_BUFFER_FORMAT_NV12:
            desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
            // Additional half-height plane for UV data
            desc.Height += enc->cy / 2;
            break;
        case NV_ENC_BUFFER_FORMAT_YUV420_10BIT:
            desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; // 2 bytes per element
            desc.Height += enc->cy / 2;
            break;
        case NV_ENC_BUFFER_FORMAT_YUV444:
            desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
            desc.Height *= 3; // 3 full-size planes
            break;
        default:
            error("Unknown input format: %d", enc->surface_format);
            return false;
        }
    }

    CU_FAILED(cu->cuArray3DCreate(&nvsurf->tex, &desc))

    NV_ENC_REGISTER_RESOURCE res = {0};
    res.version = NV_ENC_REGISTER_RESOURCE_VER;
    res.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDAARRAY;
    res.resourceToRegister = (void *)nvsurf->tex;
    res.width = enc->cx;
    res.height = enc->cy;
    res.pitch = (uint32_t)(desc.Width * desc.NumChannels);
    if (!enc->non_texture) {
        res.bufferFormat = p010 ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12;
    } else {
        res.bufferFormat = enc->surface_format;
    }

    if (NV_FAILED(nv.nvEncRegisterResource(enc->session, &res))) {
        return false;
    }

    nvsurf->res = res.registeredResource;
    nvsurf->mapped_res = NULL;
    return true;
}
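
/* Maps the OBS input format to an NVENC buffer format and allocates one
 * CUDA surface per encoder buffer inside the encoder's CUDA context. */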
bool cuda_init_surfaces(struct nvenc_data *enc)
{
    switch (enc->in_format) {
    case VIDEO_FORMAT_P010:
        enc->surface_format = NV_ENC_BUFFER_FORMAT_YUV420_10BIT;
        break;
    case VIDEO_FORMAT_I444:
        enc->surface_format = NV_ENC_BUFFER_FORMAT_YUV444;
        break;
    default:
        enc->surface_format = NV_ENC_BUFFER_FORMAT_NV12;
    }

    da_reserve(enc->surfaces, enc->buf_count);

    CU_FAILED(cu->cuCtxPushCurrent(enc->cu_ctx))
    for (uint32_t i = 0; i < enc->buf_count; i++) {
        struct nv_cuda_surface buf;
        if (!cuda_surface_init(enc, &buf)) {
            return false;
        }
        da_push_back(enc->surfaces, &buf);
    }
    CU_FAILED(cu->cuCtxPopCurrent(NULL))

    return true;
}
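
/* Releases a single surface: unmaps it if still mapped, unregisters it from
 * NVENC, and destroys the backing CUDA array. The caller is expected to make
 * enc->cu_ctx current first. */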
static void cuda_surface_free(struct nvenc_data *enc, struct nv_cuda_surface *nvsurf)
{
    if (nvsurf->res) {
        if (nvsurf->mapped_res) {
            nv.nvEncUnmapInputResource(enc->session, nvsurf->mapped_res);
        }
        nv.nvEncUnregisterResource(enc->session, nvsurf->res);
        cu->cuArrayDestroy(nvsurf->tex);
    }
}
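
/* Releases all input surfaces with the encoder's CUDA context made current
 * for the duration. */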
void cuda_free_surfaces(struct nvenc_data *enc)
{
    if (!enc->cu_ctx)
        return;

    cu->cuCtxPushCurrent(enc->cu_ctx);
    for (size_t i = 0; i < enc->surfaces.num; i++) {
        cuda_surface_free(enc, &enc->surfaces.array[i]);
    }
    cu->cuCtxPopCurrent(NULL);
}

/* ------------------------------------------------------------------------- */
/* Actual encoding stuff */
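
/* Uploads a raw frame from host memory into the surface's CUDA array, one
 * cuMemcpy2D per plane, advancing dstY so each plane lands below the previous
 * one. The host buffers are page-locked for the duration to allow direct DMA;
 * the unmap label is the common cleanup path on both success and failure. */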
static inline bool copy_frame(struct nvenc_data *enc, struct encoder_frame *frame, struct nv_cuda_surface *surf)
{
    bool success = true;
    size_t height = enc->cy;
    size_t width = enc->cx;
    CUDA_MEMCPY2D m = {0};

    m.srcMemoryType = CU_MEMORYTYPE_HOST;
    m.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    m.dstArray = surf->tex;
    m.WidthInBytes = width;
    m.Height = height;

    CU_FAILED(cu->cuCtxPushCurrent(enc->cu_ctx))

    if (enc->surface_format == NV_ENC_BUFFER_FORMAT_NV12) {
        /* Page-locks the host memory so that it can be DMA'd directly
         * rather than CUDA doing an internal copy to page-locked
         * memory before actually DMA-ing to the GPU. */
        CU_CHECK(cu->cuMemHostRegister(frame->data[0], frame->linesize[0] * height, 0))
        CU_CHECK(cu->cuMemHostRegister(frame->data[1], frame->linesize[1] * height / 2, 0))

        // Y plane
        m.srcPitch = frame->linesize[0];
        m.srcHost = frame->data[0];
        CU_FAILED(cu->cuMemcpy2D(&m))

        // Half-height interleaved UV plane, stored below the Y plane
        m.srcPitch = frame->linesize[1];
        m.srcHost = frame->data[1];
        m.dstY += height;
        m.Height /= 2;
        CU_FAILED(cu->cuMemcpy2D(&m))
    } else if (enc->surface_format == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) {
        CU_CHECK(cu->cuMemHostRegister(frame->data[0], frame->linesize[0] * height, 0))
        CU_CHECK(cu->cuMemHostRegister(frame->data[1], frame->linesize[1] * height / 2, 0))

        // P010 lines are double the size (16 bits per pixel)
        m.WidthInBytes *= 2;

        m.srcPitch = frame->linesize[0];
        m.srcHost = frame->data[0];
        CU_FAILED(cu->cuMemcpy2D(&m))

        m.srcPitch = frame->linesize[1];
        m.srcHost = frame->data[1];
        m.dstY += height;
        m.Height /= 2;
        CU_FAILED(cu->cuMemcpy2D(&m))
    } else { // I444: three full-size planes
        CU_CHECK(cu->cuMemHostRegister(frame->data[0], frame->linesize[0] * height, 0))
        CU_CHECK(cu->cuMemHostRegister(frame->data[1], frame->linesize[1] * height, 0))
        CU_CHECK(cu->cuMemHostRegister(frame->data[2], frame->linesize[2] * height, 0))

        m.srcPitch = frame->linesize[0];
        m.srcHost = frame->data[0];
        CU_FAILED(cu->cuMemcpy2D(&m))

        m.srcPitch = frame->linesize[1];
        m.srcHost = frame->data[1];
        m.dstY += height;
        CU_FAILED(cu->cuMemcpy2D(&m))

        m.srcPitch = frame->linesize[2];
        m.srcHost = frame->data[2];
        m.dstY += height;
        CU_FAILED(cu->cuMemcpy2D(&m))
    }

unmap:
    if (frame->data[0])
        cu->cuMemHostUnregister(frame->data[0]);
    if (frame->data[1])
        cu->cuMemHostUnregister(frame->data[1]);
    if (frame->data[2])
        cu->cuMemHostUnregister(frame->data[2]);
    CU_FAILED(cu->cuCtxPopCurrent(NULL))

    return success;
}
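
/* Encoder entry point for the CUDA path: copies the frame into the surface
 * paired with the next bitstream buffer, maps it as an NVENC input resource,
 * and defers to the shared nvenc_encode_base() for the actual encode call. */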
bool cuda_encode(void *data, struct encoder_frame *frame, struct encoder_packet *packet, bool *received_packet)
{
    struct nvenc_data *enc = data;
    struct nv_cuda_surface *surf;
    struct nv_bitstream *bs;

    bs = &enc->bitstreams.array[enc->next_bitstream];
    surf = &enc->surfaces.array[enc->next_bitstream];

    deque_push_back(&enc->dts_list, &frame->pts, sizeof(frame->pts));

    /* ------------------------------------ */
    /* copy to CUDA surface                 */

    if (!copy_frame(enc, frame, surf))
        return false;

    /* ------------------------------------ */
    /* map the surface so NVENC can use it  */

    NV_ENC_MAP_INPUT_RESOURCE map = {NV_ENC_MAP_INPUT_RESOURCE_VER};
    map.registeredResource = surf->res;
    map.mappedBufferFmt = enc->surface_format;
    if (NV_FAILED(nv.nvEncMapInputResource(enc->session, &map)))
        return false;

    surf->mapped_res = map.mappedResource;

    /* ------------------------------------ */
    /* do actual encode call                */

    return nvenc_encode_base(enc, bs, surf->mapped_res, frame->pts, packet, received_packet);
}