nvenc-cuda.c

#include "nvenc-internal.h"
#include "nvenc-helpers.h"

/*
 * NVENC implementation using CUDA context and arrays
 */

/* ------------------------------------------------------------------------- */
/* CUDA Context management */
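
/*
 * Create the CUDA context used for encoding.
 *
 * On Windows the texture encoder path goes through D3D11, so no CUDA context
 * is needed when `texture` is set. Elsewhere, the device is normally derived
 * from the current OpenGL context via cuGLGetDevices(); the non-texture path
 * (or the hidden force_cuda_tex option) uses the user-selected device index
 * instead.
 *
 * CU_FAILED()/NV_FAIL() come from nvenc-internal.h and are assumed to log the
 * error and return false from the enclosing function on failure.
 */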
bool cuda_ctx_init(struct nvenc_data *enc, obs_data_t *settings,
                   const bool texture)
{
#ifdef _WIN32
        if (texture)
                return true;
#endif
        int count;
        CUdevice device;
        int gpu = (int)obs_data_get_int(settings, "device");

#ifndef _WIN32
        /* CUDA can do fairly efficient cross-GPU OpenGL mappings, allow it as
         * a hidden option for experimentation. */
        bool force_cuda_tex = obs_data_get_bool(settings, "force_cuda_tex");
#endif

        if (gpu == -1)
                gpu = 0;

        CU_FAILED(cu->cuInit(0))
        CU_FAILED(cu->cuDeviceGetCount(&count))
        if (!count) {
                NV_FAIL("No CUDA devices found");
                return false;
        }

#ifdef _WIN32
        CU_FAILED(cu->cuDeviceGet(&device, gpu))
#else
        if (!texture || force_cuda_tex) {
                CU_FAILED(cu->cuDeviceGet(&device, gpu))
        } else {
                unsigned int ctx_count = 0;
                CUdevice devices[2];

                obs_enter_graphics();
                CUresult res = cu->cuGLGetDevices(&ctx_count, devices, 2,
                                                  CU_GL_DEVICE_LIST_ALL);
                obs_leave_graphics();

                if (res != CUDA_SUCCESS || !ctx_count) {
                        /* Probably running on an iGPU, should just fall back
                         * to the non-texture encoder. */
                        if (res == CUDA_ERROR_INVALID_GRAPHICS_CONTEXT) {
                                info("Not running on NVIDIA GPU, falling back "
                                     "to non-texture encoder");
                        } else {
                                const char *name, *desc;
                                if (cuda_get_error_desc(res, &name, &desc)) {
                                        error("Failed to get a CUDA device for "
                                              "the current OpenGL context: "
                                              "%s: %s",
                                              name, desc);
                                } else {
                                        error("Failed to get a CUDA device for "
                                              "the current OpenGL context: %d",
                                              res);
                                }
                        }
                        return false;
                }

                /* Documentation indicates this should only ever happen with
                 * SLI, i.e. never for OBS. */
                if (ctx_count > 1) {
                        warn("Got more than one CUDA device for the OpenGL "
                             "context, this is untested.");
                }

                device = devices[0];
                debug("Loading up CUDA on device %u", device);
        }
#endif

        CU_FAILED(cu->cuCtxCreate(&enc->cu_ctx, 0, device))
        CU_FAILED(cu->cuCtxPopCurrent(NULL))

        return true;
}
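
/*
 * Destroy the CUDA context, popping it first in case it is still current on
 * this thread.
 */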
void cuda_ctx_free(struct nvenc_data *enc)
{
        if (enc->cu_ctx) {
                cu->cuCtxPopCurrent(NULL);
                cu->cuCtxDestroy(enc->cu_ctx);
        }
}

/* ------------------------------------------------------------------------- */
/* CUDA Surface management */
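
/*
 * Allocate a single CUDA array large enough to hold every plane of one frame
 * and register it with NVENC as an input resource. Planes are stacked
 * vertically in the array, which is why the height is extended by a
 * half-height chroma plane (or tripled for 4:4:4) beyond the frame height.
 */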
static bool cuda_surface_init(struct nvenc_data *enc,
                              struct nv_cuda_surface *nvsurf)
{
        const bool p010 = obs_p010_tex_active();
        CUDA_ARRAY3D_DESCRIPTOR desc;
        desc.Width = enc->cx;
        desc.Height = enc->cy;
        desc.Depth = 0;
        desc.Flags = CUDA_ARRAY3D_SURFACE_LDST;
        desc.NumChannels = 1;

        if (!enc->non_texture) {
                desc.Format = p010 ? CU_AD_FORMAT_UNSIGNED_INT16
                                   : CU_AD_FORMAT_UNSIGNED_INT8;
                desc.Height = enc->cy + enc->cy / 2;
        } else {
                switch (enc->surface_format) {
                case NV_ENC_BUFFER_FORMAT_NV12:
                        desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
                        // Additional half-height plane for UV data
                        desc.Height += enc->cy / 2;
                        break;
                case NV_ENC_BUFFER_FORMAT_YUV420_10BIT:
                        desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
                        desc.Height += enc->cy / 2;
                        // Doubles the pitch registered below
                        desc.NumChannels = 2;
                        break;
                case NV_ENC_BUFFER_FORMAT_YUV444:
                        desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
                        desc.Height *= 3; // 3 full-size planes
                        break;
                default:
                        error("Unknown input format: %d", enc->surface_format);
                        return false;
                }
        }

        CU_FAILED(cu->cuArray3DCreate(&nvsurf->tex, &desc))

        NV_ENC_REGISTER_RESOURCE res = {0};
        res.version = NV_ENC_REGISTER_RESOURCE_VER;
        res.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDAARRAY;
        res.resourceToRegister = (void *)nvsurf->tex;
        res.width = enc->cx;
        res.height = enc->cy;
        res.pitch = (uint32_t)(desc.Width * desc.NumChannels);

        if (!enc->non_texture) {
                res.bufferFormat = p010 ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT
                                        : NV_ENC_BUFFER_FORMAT_NV12;
        } else {
                res.bufferFormat = enc->surface_format;
        }

        if (NV_FAILED(nv.nvEncRegisterResource(enc->session, &res))) {
                return false;
        }

        nvsurf->res = res.registeredResource;
        nvsurf->mapped_res = NULL;
        return true;
}
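
/*
 * Select the NVENC buffer format matching the incoming OBS video format and
 * create one CUDA surface per in-flight encoder buffer.
 */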
bool cuda_init_surfaces(struct nvenc_data *enc)
{
        switch (enc->in_format) {
        case VIDEO_FORMAT_P010:
                enc->surface_format = NV_ENC_BUFFER_FORMAT_YUV420_10BIT;
                break;
        case VIDEO_FORMAT_I444:
                enc->surface_format = NV_ENC_BUFFER_FORMAT_YUV444;
                break;
        default:
                enc->surface_format = NV_ENC_BUFFER_FORMAT_NV12;
        }

        da_reserve(enc->surfaces, enc->buf_count);

        CU_FAILED(cu->cuCtxPushCurrent(enc->cu_ctx))
        for (uint32_t i = 0; i < enc->buf_count; i++) {
                struct nv_cuda_surface buf;
                if (!cuda_surface_init(enc, &buf)) {
                        return false;
                }

                da_push_back(enc->surfaces, &buf);
        }
        CU_FAILED(cu->cuCtxPopCurrent(NULL))

        return true;
}
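
/*
 * Unmap and unregister a surface from NVENC, then destroy the backing CUDA
 * array. The caller is expected to have made the CUDA context current.
 */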
static void cuda_surface_free(struct nvenc_data *enc,
                              struct nv_cuda_surface *nvsurf)
{
        if (nvsurf->res) {
                if (nvsurf->mapped_res) {
                        nv.nvEncUnmapInputResource(enc->session,
                                                   nvsurf->mapped_res);
                }
                nv.nvEncUnregisterResource(enc->session, nvsurf->res);
                cu->cuArrayDestroy(nvsurf->tex);
        }
}
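
/*
 * Free all CUDA surfaces. Safe to call even if the context was never
 * created.
 */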
void cuda_free_surfaces(struct nvenc_data *enc)
{
        if (!enc->cu_ctx)
                return;

        cu->cuCtxPushCurrent(enc->cu_ctx);
        for (size_t i = 0; i < enc->surfaces.num; i++) {
                cuda_surface_free(enc, &enc->surfaces.array[i]);
        }
        cu->cuCtxPopCurrent(NULL);
}

/* ------------------------------------------------------------------------- */
/* Actual encoding stuff */
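
/*
 * Upload one raw frame from host memory into the surface's CUDA array, one
 * cuMemcpy2D per plane, advancing dstY so the planes stack vertically in the
 * layout set up by cuda_surface_init().
 *
 * CU_CHECK() (from nvenc-internal.h) is assumed to set `success` to false
 * and jump to the `unmap` label on failure so that page-locked host memory
 * is unregistered on the error path.
 */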
static inline bool copy_frame(struct nvenc_data *enc,
                              struct encoder_frame *frame,
                              struct nv_cuda_surface *surf)
{
        bool success = true;
        size_t height = enc->cy;
        size_t width = enc->cx;
        CUDA_MEMCPY2D m = {0};

        m.srcMemoryType = CU_MEMORYTYPE_HOST;
        m.dstMemoryType = CU_MEMORYTYPE_ARRAY;
        m.dstArray = surf->tex;
        m.WidthInBytes = width;
        m.Height = height;

        CU_FAILED(cu->cuCtxPushCurrent(enc->cu_ctx))

        if (enc->surface_format == NV_ENC_BUFFER_FORMAT_NV12) {
                /* Page-locks the host memory so that it can be DMAd directly
                 * rather than CUDA doing an internal copy to page-locked
                 * memory before actually DMA-ing to the GPU. */
                CU_CHECK(cu->cuMemHostRegister(frame->data[0],
                                               frame->linesize[0] * height, 0))
                CU_CHECK(cu->cuMemHostRegister(
                        frame->data[1], frame->linesize[1] * height / 2, 0))

                m.srcPitch = frame->linesize[0];
                m.srcHost = frame->data[0];
                CU_FAILED(cu->cuMemcpy2D(&m))

                m.srcPitch = frame->linesize[1];
                m.srcHost = frame->data[1];
                m.dstY += height;
                m.Height /= 2;
                CU_FAILED(cu->cuMemcpy2D(&m))
        } else if (enc->surface_format == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) {
                CU_CHECK(cu->cuMemHostRegister(frame->data[0],
                                               frame->linesize[0] * height, 0))
                CU_CHECK(cu->cuMemHostRegister(
                        frame->data[1], frame->linesize[1] * height / 2, 0))

                // P010 lines are double the size (16 bits per pixel)
                m.WidthInBytes *= 2;

                m.srcPitch = frame->linesize[0];
                m.srcHost = frame->data[0];
                CU_FAILED(cu->cuMemcpy2D(&m))

                m.srcPitch = frame->linesize[1];
                m.srcHost = frame->data[1];
                m.dstY += height;
                m.Height /= 2;
                CU_FAILED(cu->cuMemcpy2D(&m))
        } else { // I444
                CU_CHECK(cu->cuMemHostRegister(frame->data[0],
                                               frame->linesize[0] * height, 0))
                CU_CHECK(cu->cuMemHostRegister(frame->data[1],
                                               frame->linesize[1] * height, 0))
                CU_CHECK(cu->cuMemHostRegister(frame->data[2],
                                               frame->linesize[2] * height, 0))

                m.srcPitch = frame->linesize[0];
                m.srcHost = frame->data[0];
                CU_FAILED(cu->cuMemcpy2D(&m))

                m.srcPitch = frame->linesize[1];
                m.srcHost = frame->data[1];
                m.dstY += height;
                CU_FAILED(cu->cuMemcpy2D(&m))

                m.srcPitch = frame->linesize[2];
                m.srcHost = frame->data[2];
                m.dstY += height;
                CU_FAILED(cu->cuMemcpy2D(&m))
        }

unmap:
        if (frame->data[0])
                cu->cuMemHostUnregister(frame->data[0]);
        if (frame->data[1])
                cu->cuMemHostUnregister(frame->data[1]);
        if (frame->data[2])
                cu->cuMemHostUnregister(frame->data[2]);

        CU_FAILED(cu->cuCtxPopCurrent(NULL))
        return success;
}
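
/*
 * Encode one frame: copy it into the next CUDA surface, map the registered
 * resource so NVENC can read it, then hand off to the shared
 * nvenc_encode_base() path for the actual encode call and packet retrieval.
 */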
bool cuda_encode(void *data, struct encoder_frame *frame,
                 struct encoder_packet *packet, bool *received_packet)
{
        struct nvenc_data *enc = data;
        struct nv_cuda_surface *surf;
        struct nv_bitstream *bs;

        bs = &enc->bitstreams.array[enc->next_bitstream];
        surf = &enc->surfaces.array[enc->next_bitstream];

        deque_push_back(&enc->dts_list, &frame->pts, sizeof(frame->pts));

        /* ------------------------------------ */
        /* copy to CUDA surface                  */

        if (!copy_frame(enc, frame, surf))
                return false;

        /* ------------------------------------ */
        /* map output tex so nvenc can use it    */

        NV_ENC_MAP_INPUT_RESOURCE map = {NV_ENC_MAP_INPUT_RESOURCE_VER};
        map.registeredResource = surf->res;
        map.mappedBufferFmt = enc->surface_format;

        if (NV_FAILED(nv.nvEncMapInputResource(enc->session, &map)))
                return false;

        surf->mapped_res = map.mappedResource;

        /* ------------------------------------ */
        /* do actual encode call                 */

        return nvenc_encode_base(enc, bs, surf->mapped_res, frame->pts, packet,
                                 received_packet);
}