/******************************************************************************
    Copyright (C) 2023 by Dennis Sädtler <[email protected]>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
#include "source-profiler.h"

#include "darray.h"
#include "obs-internal.h"
#include "platform.h"
#include "threading.h"
#include "uthash.h"
struct frame_sample {
	uint64_t tick;
	DARRAY(uint64_t) render_cpu;
	DARRAY(gs_timer_t *) render_timers;
};
/* Buffer frame data collection to give the GPU time to finish rendering.
 * Set to the same as the rendering buffer (NUM_TEXTURES) */
#define FRAME_BUFFER_SIZE NUM_TEXTURES
struct source_samples {
	/* the pointer address of the source is the hashtable key */
	uintptr_t key;
	uint8_t frame_idx;
	struct frame_sample *frames[FRAME_BUFFER_SIZE];

	UT_hash_handle hh;
};
/* Basic fixed-size circular buffer to hold the most recent N uint64_t values
 * (older items will be overwritten). */
struct ucirclebuf {
	size_t idx;
	size_t capacity;
	size_t num;
	uint64_t *array;
};
struct profiler_entry {
	/* the pointer address of the source is the hashtable key */
	uintptr_t key;
	/* Tick times for last N frames */
	struct ucirclebuf tick;
	/* Time of first render pass in a frame, for last N frames */
	struct ucirclebuf render_cpu;
	struct ucirclebuf render_gpu;
	/* Sum of all render passes in a frame, for last N frames */
	struct ucirclebuf render_cpu_sum;
	struct ucirclebuf render_gpu_sum;
	/* Timestamps of last N async frame submissions */
	struct ucirclebuf async_frame_ts;
	/* Timestamps of last N async frames rendered */
	struct ucirclebuf async_rendered_ts;

	UT_hash_handle hh;
};
/* Hashmaps */
struct source_samples *hm_samples = NULL;
struct profiler_entry *hm_entries = NULL;
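/* Locking note (derived from usage below): hm_samples is only ever touched
 * from the graphics thread (tick/render/collect and the queued delete task),
 * so it needs no lock; hm_entries is also read from UI and async-frame
 * threads and is therefore guarded by hm_rwlock. */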
/* GPU timer ranges (only required for DirectX) */
static uint8_t timer_idx = 0;
static gs_timer_range_t *timer_ranges[FRAME_BUFFER_SIZE] = {0};

static uint64_t profiler_samples = 0;

/* Sources can be rendered more than once per frame; to avoid reallocating
 * memory in the majority of cases, reserve at least two entries. */
static const size_t render_times_reservation = 2;

pthread_rwlock_t hm_rwlock = PTHREAD_RWLOCK_INITIALIZER;

static bool enabled = false;
static bool gpu_enabled = false;
/* These can be set from other threads; mark them volatile */
static volatile bool enable_next = false;
static volatile bool gpu_enable_next = false;
void ucirclebuf_init(struct ucirclebuf *buf, size_t capacity)
{
	if (!capacity)
		return;

	memset(buf, 0, sizeof(struct ucirclebuf));
	buf->capacity = capacity;
	buf->array = bmalloc(sizeof(uint64_t) * capacity);
}
void ucirclebuf_free(struct ucirclebuf *buf)
{
	bfree(buf->array);
	memset(buf, 0, sizeof(struct ucirclebuf));
}
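/* Values are appended until the buffer is full, after which the oldest slot
 * is overwritten. For example, with capacity 3, pushing 1..5 leaves {4, 5, 3}
 * in the array; idx always points at the slot to be overwritten next. */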
void ucirclebuf_push(struct ucirclebuf *buf, uint64_t val)
{
	if (buf->num == buf->capacity) {
		buf->idx %= buf->capacity;
		buf->array[buf->idx++] = val;
		return;
	}

	buf->array[buf->idx++] = val;
	buf->num++;
}
static struct frame_sample *frame_sample_create(void)
{
	struct frame_sample *smp = bzalloc(sizeof(struct frame_sample));
	da_reserve(smp->render_cpu, render_times_reservation);
	da_reserve(smp->render_timers, render_times_reservation);
	return smp;
}

static void frame_sample_destroy(struct frame_sample *sample)
{
	if (sample->render_timers.num) {
		gs_enter_context(obs->video.graphics);
		for (size_t i = 0; i < sample->render_timers.num; i++)
			gs_timer_destroy(sample->render_timers.array[i]);
		gs_leave_context();
	}

	da_free(sample->render_cpu);
	da_free(sample->render_timers);
	bfree(sample);
}
struct source_samples *source_samples_create(const uintptr_t key)
{
	struct source_samples *smps = bzalloc(sizeof(struct source_samples));
	smps->key = key;
	for (size_t i = 0; i < FRAME_BUFFER_SIZE; i++)
		smps->frames[i] = frame_sample_create();
	return smps;
}

static void source_samples_destroy(struct source_samples *sample)
{
	for (size_t i = 0; i < FRAME_BUFFER_SIZE; i++)
		frame_sample_destroy(sample->frames[i]);
	bfree(sample);
}
static struct profiler_entry *entry_create(const uintptr_t key)
{
	struct profiler_entry *ent = bzalloc(sizeof(struct profiler_entry));
	ent->key = key;

	ucirclebuf_init(&ent->tick, profiler_samples);
	ucirclebuf_init(&ent->render_cpu, profiler_samples);
	ucirclebuf_init(&ent->render_gpu, profiler_samples);
	ucirclebuf_init(&ent->render_cpu_sum, profiler_samples);
	ucirclebuf_init(&ent->render_gpu_sum, profiler_samples);
	ucirclebuf_init(&ent->async_frame_ts, profiler_samples);
	ucirclebuf_init(&ent->async_rendered_ts, profiler_samples);

	return ent;
}

static void entry_destroy(struct profiler_entry *entry)
{
	ucirclebuf_free(&entry->tick);
	ucirclebuf_free(&entry->render_cpu);
	ucirclebuf_free(&entry->render_gpu);
	ucirclebuf_free(&entry->render_cpu_sum);
	ucirclebuf_free(&entry->render_gpu_sum);
	ucirclebuf_free(&entry->async_frame_ts);
	ucirclebuf_free(&entry->async_rendered_ts);
	bfree(entry);
}
static void reset_gpu_timers(void)
{
	gs_enter_context(obs->video.graphics);
	for (int i = 0; i < FRAME_BUFFER_SIZE; i++) {
		if (timer_ranges[i]) {
			gs_timer_range_destroy(timer_ranges[i]);
			timer_ranges[i] = NULL;
		}
	}
	gs_leave_context();
}
static void profiler_shutdown(void)
{
	struct source_samples *smp, *tmp;
	HASH_ITER (hh, hm_samples, smp, tmp) {
		HASH_DEL(hm_samples, smp);
		source_samples_destroy(smp);
	}

	pthread_rwlock_wrlock(&hm_rwlock);
	struct profiler_entry *ent, *etmp;
	HASH_ITER (hh, hm_entries, ent, etmp) {
		HASH_DEL(hm_entries, ent);
		entry_destroy(ent);
	}
	pthread_rwlock_unlock(&hm_rwlock);

	reset_gpu_timers();
}
void source_profiler_enable(bool enable)
{
	enable_next = enable;
}

void source_profiler_gpu_enable(bool enable)
{
	gpu_enable_next = enable && enable_next;
}
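/* profiler_samples is sized to keep roughly 5 seconds of history at the
 * configured output frame rate (e.g., 300 samples at 60 FPS). */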
void source_profiler_reset_video(struct obs_video_info *ovi)
{
	double fps = ceil((double)ovi->fps_num / (double)ovi->fps_den);
	profiler_samples = (uint64_t)(fps * 5);

	/* This is fine because the video thread won't be running at this point */
	profiler_shutdown();
}
void source_profiler_render_begin(void)
{
	if (!gpu_enabled)
		return;

	gs_enter_context(obs->video.graphics);
	if (!timer_ranges[timer_idx])
		timer_ranges[timer_idx] = gs_timer_range_create();
	gs_timer_range_begin(timer_ranges[timer_idx]);
	gs_leave_context();
}

void source_profiler_render_end(void)
{
	if (!gpu_enabled || !timer_ranges[timer_idx])
		return;

	gs_enter_context(obs->video.graphics);
	gs_timer_range_end(timer_ranges[timer_idx]);
	gs_leave_context();
}
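/* Rough call order within a frame on the graphics thread (as driven by the
 * libobs video loop): frame_begin, then source_tick_start/_end per source,
 * then render_begin, per-source source_render_begin/_end pairs, render_end,
 * and finally frame_collect once rendering is done. */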
void source_profiler_frame_begin(void)
{
	if (!enabled && enable_next)
		enabled = true;

	if (!gpu_enabled && enabled && gpu_enable_next) {
		gpu_enabled = true;
	} else if (gpu_enabled) {
		/* Advance timer idx if gpu enabled */
		timer_idx = (timer_idx + 1) % FRAME_BUFFER_SIZE;
	}
}
static inline bool is_async_video_source(const struct obs_source *source)
{
	return (source->info.output_flags & OBS_SOURCE_ASYNC_VIDEO) ==
	       OBS_SOURCE_ASYNC_VIDEO;
}

static const char *source_profiler_frame_collect_name =
	"source_profiler_frame_collect";
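/* Runs once per frame and folds finished samples into hm_entries. It
 * deliberately reads the oldest in-flight frame_sample (FRAME_BUFFER_SIZE - 1
 * frames behind the current one) so that GPU timer queries have had time to
 * complete and can be read back without stalling the pipeline. */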
void source_profiler_frame_collect(void)
{
	if (!enabled)
		return;

	profile_start(source_profiler_frame_collect_name);

	bool gpu_disjoint = false;
	bool gpu_ready = false;
	uint64_t freq = 0;

	if (gpu_enabled) {
		uint8_t timer_range_idx = (timer_idx + 1) % FRAME_BUFFER_SIZE;
		if (timer_ranges[timer_range_idx]) {
			gpu_ready = true;
			gs_enter_context(obs->video.graphics);
			gs_timer_range_get_data(timer_ranges[timer_range_idx],
						&gpu_disjoint, &freq);
		}

		if (gpu_disjoint) {
			blog(LOG_WARNING,
			     "GPU Timers were disjoint, discarding samples.");
		}
	}

	pthread_rwlock_wrlock(&hm_rwlock);

	struct source_samples *smps = hm_samples;
	while (smps) {
		/* processing is delayed by FRAME_BUFFER_SIZE - 1 frames */
		uint8_t frame_idx = (smps->frame_idx + 1) % FRAME_BUFFER_SIZE;
		struct frame_sample *smp = smps->frames[frame_idx];

		if (!smp->tick) {
			/* No data yet */
			smps = smps->hh.next;
			continue;
		}

		struct profiler_entry *ent;
		HASH_FIND_PTR(hm_entries, &smps->key, ent);
		if (!ent) {
			ent = entry_create(smps->key);
			HASH_ADD_PTR(hm_entries, key, ent);
		}

		ucirclebuf_push(&ent->tick, smp->tick);

		if (smp->render_cpu.num) {
			uint64_t sum = 0;
			for (size_t idx = 0; idx < smp->render_cpu.num; idx++) {
				sum += smp->render_cpu.array[idx];
			}

			ucirclebuf_push(&ent->render_cpu,
					smp->render_cpu.array[0]);
			ucirclebuf_push(&ent->render_cpu_sum, sum);
			da_clear(smp->render_cpu);
		}

		/* Note that we still check this even if GPU profiling has been
		 * disabled to destroy leftover timers. */
		if (smp->render_timers.num) {
			uint64_t sum = 0, first = 0, ticks = 0;

			for (size_t i = 0; i < smp->render_timers.num; i++) {
				gs_timer_t *timer = smp->render_timers.array[i];
				if (gpu_ready && !gpu_disjoint &&
				    gs_timer_get_data(timer, &ticks)) {
					/* Convert ticks to ns */
					sum += util_mul_div64(
						ticks, 1000000000ULL, freq);
					if (!first)
						first = sum;
				}
				gs_timer_destroy(timer);
			}

			if (first) {
				ucirclebuf_push(&ent->render_gpu, first);
				ucirclebuf_push(&ent->render_gpu_sum, sum);
			}

			da_clear(smp->render_timers);
		}

		const obs_source_t *src = *(const obs_source_t **)smps->hh.key;
		if (is_async_video_source(src)) {
			uint64_t ts = obs_source_get_last_async_ts(src);
			ucirclebuf_push(&ent->async_rendered_ts, ts);
		}

		smps = smps->hh.next;
	}

	pthread_rwlock_unlock(&hm_rwlock);

	if (gpu_enabled && gpu_ready)
		gs_leave_context();

	/* Apply updated states for next frame */
	if (!enable_next) {
		enabled = gpu_enabled = false;
		profiler_shutdown();
	} else if (!gpu_enable_next) {
		gpu_enabled = false;
		reset_gpu_timers();
	}

	profile_end(source_profiler_frame_collect_name);
}
void source_profiler_async_frame_received(obs_source_t *source)
{
	if (!enabled)
		return;

	uint64_t ts = os_gettime_ns();

	pthread_rwlock_wrlock(&hm_rwlock);
	struct profiler_entry *ent;
	HASH_FIND_PTR(hm_entries, &source, ent);
	if (ent)
		ucirclebuf_push(&ent->async_frame_ts, ts);
	pthread_rwlock_unlock(&hm_rwlock);
}
uint64_t source_profiler_source_tick_start(void)
{
	if (!enabled)
		return 0;

	return os_gettime_ns();
}

void source_profiler_source_tick_end(obs_source_t *source, uint64_t start)
{
	if (!enabled)
		return;

	const uint64_t delta = os_gettime_ns() - start;

	struct source_samples *smp = NULL;
	HASH_FIND_PTR(hm_samples, &source, smp);
	if (!smp) {
		smp = source_samples_create((uintptr_t)source);
		HASH_ADD_PTR(hm_samples, key, smp);
	} else {
		/* Advance index here since tick happens first and only once
		 * at the start of each frame. */
		smp->frame_idx = (smp->frame_idx + 1) % FRAME_BUFFER_SIZE;
	}

	smp->frames[smp->frame_idx]->tick = delta;
}
uint64_t source_profiler_source_render_begin(gs_timer_t **timer)
{
	if (!enabled)
		return 0;

	if (gpu_enabled) {
		*timer = gs_timer_create();
		gs_timer_begin(*timer);
	} else {
		*timer = NULL;
	}

	return os_gettime_ns();
}

void source_profiler_source_render_end(obs_source_t *source, uint64_t start,
				       gs_timer_t *timer)
{
	if (!enabled)
		return;

	if (timer)
		gs_timer_end(timer);

	const uint64_t delta = os_gettime_ns() - start;

	struct source_samples *smp;
	HASH_FIND_PTR(hm_samples, &source, smp);
	if (smp) {
		da_push_back(smp->frames[smp->frame_idx]->render_cpu, &delta);
		if (timer) {
			da_push_back(smp->frames[smp->frame_idx]->render_timers,
				     &timer);
		}
	} else if (timer) {
		gs_timer_destroy(timer);
	}
}
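/* A source rendered multiple times in one frame pushes one CPU time and one
 * GPU timer per pass; frame_collect later records the first pass as
 * render_cpu/render_gpu and the total over all passes as the *_sum values. */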
static void task_delete_source(void *key)
{
	struct source_samples *smp;
	HASH_FIND_PTR(hm_samples, &key, smp);
	if (smp) {
		HASH_DEL(hm_samples, smp);
		source_samples_destroy(smp);
	}

	/* Deleting from hm_entries modifies the table, so the write lock is
	 * required here (a read lock is not sufficient). */
	pthread_rwlock_wrlock(&hm_rwlock);
	struct profiler_entry *ent = NULL;
	HASH_FIND_PTR(hm_entries, &key, ent);
	if (ent) {
		HASH_DEL(hm_entries, ent);
		entry_destroy(ent);
	}
	pthread_rwlock_unlock(&hm_rwlock);
}
void source_profiler_remove_source(obs_source_t *source)
{
	if (!enabled)
		return;

	/* Schedule deletion task on graphics thread */
	obs_queue_task(OBS_TASK_GRAPHICS, task_delete_source, source, false);
}
static inline void calculate_tick(struct profiler_entry *ent,
				  struct profiler_result *result)
{
	size_t idx = 0;
	uint64_t sum = 0;

	for (; idx < ent->tick.num; idx++) {
		const uint64_t delta = ent->tick.array[idx];
		if (delta > result->tick_max)
			result->tick_max = delta;
		sum += delta;
	}

	if (idx)
		result->tick_avg = sum / idx;
}
static inline void calculate_render(struct profiler_entry *ent,
				    struct profiler_result *result)
{
	size_t idx;
	uint64_t sum = 0, sum_sum = 0;

	for (idx = 0; idx < ent->render_cpu.num; idx++) {
		const uint64_t delta = ent->render_cpu.array[idx];
		if (delta > result->render_max)
			result->render_max = delta;
		sum += delta;
		sum_sum += ent->render_cpu_sum.array[idx];
	}

	if (idx) {
		result->render_avg = sum / idx;
		result->render_sum = sum_sum / idx;
	}

	if (!gpu_enabled)
		return;

	sum = sum_sum = 0;
	for (idx = 0; idx < ent->render_gpu.num; idx++) {
		const uint64_t delta = ent->render_gpu.array[idx];
		if (delta > result->render_gpu_max)
			result->render_gpu_max = delta;
		sum += delta;
		sum_sum += ent->render_gpu_sum.array[idx];
	}

	if (idx) {
		result->render_gpu_avg = sum / idx;
		result->render_gpu_sum = sum_sum / idx;
	}
}
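/* Derive FPS statistics from a ring of timestamps: the average is 1e9 ns
 * divided by the mean frame-to-frame delta (e.g., a mean delta of
 * 16,666,667 ns is ~60 FPS); best and worst are returned as the smallest
 * and largest deltas in ns, respectively. */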
static inline void calculate_fps(const struct ucirclebuf *frames, double *avg,
				 uint64_t *best, uint64_t *worst)
{
	uint64_t deltas = 0, delta_sum = 0, best_delta = 0, worst_delta = 0;

	for (size_t idx = 0; idx < frames->num; idx++) {
		const uint64_t ts = frames->array[idx];
		if (!ts)
			break;

		size_t prev_idx = idx ? idx - 1 : frames->num - 1;
		const uint64_t prev_ts = frames->array[prev_idx];
		if (!prev_ts || prev_ts >= ts)
			continue;

		uint64_t delta = ts - prev_ts;
		if (delta < best_delta || !best_delta)
			best_delta = delta;
		if (delta > worst_delta)
			worst_delta = delta;

		delta_sum += delta;
		deltas++;
	}

	if (deltas && delta_sum) {
		*avg = 1.0E9 / ((double)delta_sum / (double)deltas);
		*best = best_delta;
		*worst = worst_delta;
	}
}
bool source_profiler_fill_result(obs_source_t *source,
				 struct profiler_result *result)
{
	if (!enabled || !result)
		return false;

	memset(result, 0, sizeof(struct profiler_result));

	/* No or only stale data available */
	if (!obs_source_enabled(source))
		return true;

	pthread_rwlock_rdlock(&hm_rwlock);

	struct profiler_entry *ent = NULL;
	HASH_FIND_PTR(hm_entries, &source, ent);
	if (ent) {
		calculate_tick(ent, result);
		calculate_render(ent, result);

		if (is_async_video_source(source)) {
			calculate_fps(&ent->async_frame_ts,
				      &result->async_input,
				      &result->async_input_best,
				      &result->async_input_worst);
			calculate_fps(&ent->async_rendered_ts,
				      &result->async_rendered,
				      &result->async_rendered_best,
				      &result->async_rendered_worst);
		}
	}
	pthread_rwlock_unlock(&hm_rwlock);

	return !!ent;
}
profiler_result_t *source_profiler_get_result(obs_source_t *source)
{
	profiler_result_t *ret = bmalloc(sizeof(profiler_result_t));
	if (!source_profiler_fill_result(source, ret)) {
		bfree(ret);
		return NULL;
	}

	return ret;
}
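/* Usage sketch (illustrative, not part of this file): a UI thread can poll
 * per-source stats once the profiler is enabled, e.g.
 *
 *   struct profiler_result res;
 *   if (source_profiler_fill_result(src, &res))
 *           blog(LOG_INFO, "%s tick avg: %llu ns",
 *                obs_source_get_name(src),
 *                (unsigned long long)res.tick_avg);
 *
 * source_profiler_get_result() is the heap-allocating variant; the caller
 * owns the returned result and must release it with bfree(). */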