123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630 |
- /******************************************************************************
- Copyright (C) 2023 by Dennis Sädtler <[email protected]>
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- ******************************************************************************/
- #include "source-profiler.h"
- #include "darray.h"
- #include "obs-internal.h"
- #include "platform.h"
- #include "threading.h"
- #include "uthash.h"
- struct frame_sample {
- uint64_t tick;
- DARRAY(uint64_t) render_cpu;
- DARRAY(gs_timer_t *) render_timers;
- };
- /* Buffer frame data collection to give GPU time to finish rendering.
- * Set to the same as the rendering buffer (NUM_TEXTURES) */
- #define FRAME_BUFFER_SIZE NUM_TEXTURES
- struct source_samples {
- /* the pointer address of the source is the hashtable key */
- uintptr_t key;
- uint8_t frame_idx;
- struct frame_sample *frames[FRAME_BUFFER_SIZE];
- UT_hash_handle hh;
- };
/* Minimal fixed-capacity ring of uint64_t values: once `capacity` values have
 * been stored, each new value overwrites the oldest one. */
struct ucirclebuf {
	/* position the next push writes to */
	size_t idx;
	/* number of slots allocated in `array` */
	size_t capacity;
	/* number of slots holding a value (never exceeds `capacity`) */
	size_t num;
	/* backing storage */
	uint64_t *array;
};
- struct profiler_entry {
- /* the pointer address of the source is the hashtable key */
- uintptr_t key;
- /* Tick times for last N frames */
- struct ucirclebuf tick;
- /* Time of first render pass in a frame, for last N frames */
- struct ucirclebuf render_cpu;
- struct ucirclebuf render_gpu;
- /* Sum of all render passes in a frame, for last N frames */
- struct ucirclebuf render_cpu_sum;
- struct ucirclebuf render_gpu_sum;
- /* Timestamps of last N async frame submissions */
- struct ucirclebuf async_frame_ts;
- /* Timestamps of last N async frames rendered */
- struct ucirclebuf async_rendered_ts;
- UT_hash_handle hh;
- };
- /* Hashmaps */
- struct source_samples *hm_samples = NULL;
- struct profiler_entry *hm_entries = NULL;
- /* GPU timer ranges (only required for DirectX) */
- static uint8_t timer_idx = 0;
- static gs_timer_range_t *timer_ranges[FRAME_BUFFER_SIZE] = {0};
- static uint64_t profiler_samples = 0;
- /* Sources can be rendered more than once per frame, to avoid reallocating
- * memory in the majority of cases, reserve at least two. */
- static const size_t render_times_reservation = 2;
- pthread_rwlock_t hm_rwlock = PTHREAD_RWLOCK_INITIALIZER;
- static bool enabled = false;
- static bool gpu_enabled = false;
- /* These can be set from other threads, mark them volatile */
- static volatile bool enable_next = false;
- static volatile bool gpu_enable_next = false;
- void ucirclebuf_init(struct ucirclebuf *buf, size_t capacity)
- {
- if (!capacity)
- return;
- memset(buf, 0, sizeof(struct ucirclebuf));
- buf->capacity = capacity;
- buf->array = bmalloc(sizeof(uint64_t) * capacity);
- }
- void ucirclebuf_free(struct ucirclebuf *buf)
- {
- bfree(buf->array);
- memset(buf, 0, sizeof(struct ucirclebuf));
- }
- void ucirclebuf_push(struct ucirclebuf *buf, uint64_t val)
- {
- if (buf->num == buf->capacity) {
- buf->idx %= buf->capacity;
- buf->array[buf->idx++] = val;
- return;
- }
- buf->array[buf->idx++] = val;
- buf->num++;
- }
- static struct frame_sample *frame_sample_create(void)
- {
- struct frame_sample *smp = bzalloc(sizeof(struct frame_sample));
- da_reserve(smp->render_cpu, render_times_reservation);
- da_reserve(smp->render_timers, render_times_reservation);
- return smp;
- }
- static void frame_sample_destroy(struct frame_sample *sample)
- {
- if (sample->render_timers.num) {
- gs_enter_context(obs->video.graphics);
- for (size_t i = 0; i < sample->render_timers.num; i++)
- gs_timer_destroy(sample->render_timers.array[i]);
- gs_leave_context();
- }
- da_free(sample->render_cpu);
- da_free(sample->render_timers);
- bfree(sample);
- }
- struct source_samples *source_samples_create(const uintptr_t key)
- {
- struct source_samples *smps = bzalloc(sizeof(struct source_samples));
- smps->key = key;
- for (size_t i = 0; i < FRAME_BUFFER_SIZE; i++)
- smps->frames[i] = frame_sample_create();
- return smps;
- }
- static void source_samples_destroy(struct source_samples *sample)
- {
- for (size_t i = 0; i < FRAME_BUFFER_SIZE; i++)
- frame_sample_destroy(sample->frames[i]);
- bfree(sample);
- }
- static struct profiler_entry *entry_create(const uintptr_t key)
- {
- struct profiler_entry *ent = bzalloc(sizeof(struct profiler_entry));
- ent->key = key;
- ucirclebuf_init(&ent->tick, profiler_samples);
- ucirclebuf_init(&ent->render_cpu, profiler_samples);
- ucirclebuf_init(&ent->render_gpu, profiler_samples);
- ucirclebuf_init(&ent->render_cpu_sum, profiler_samples);
- ucirclebuf_init(&ent->render_gpu_sum, profiler_samples);
- ucirclebuf_init(&ent->async_frame_ts, profiler_samples);
- ucirclebuf_init(&ent->async_rendered_ts, profiler_samples);
- return ent;
- }
- static void entry_destroy(struct profiler_entry *entry)
- {
- ucirclebuf_free(&entry->tick);
- ucirclebuf_free(&entry->render_cpu);
- ucirclebuf_free(&entry->render_gpu);
- ucirclebuf_free(&entry->render_cpu_sum);
- ucirclebuf_free(&entry->render_gpu_sum);
- ucirclebuf_free(&entry->async_frame_ts);
- ucirclebuf_free(&entry->async_rendered_ts);
- bfree(entry);
- }
- static void reset_gpu_timers(void)
- {
- gs_enter_context(obs->video.graphics);
- for (int i = 0; i < FRAME_BUFFER_SIZE; i++) {
- if (timer_ranges[i]) {
- gs_timer_range_destroy(timer_ranges[i]);
- timer_ranges[i] = NULL;
- }
- }
- gs_leave_context();
- }
- static void profiler_shutdown(void)
- {
- struct source_samples *smp, *tmp;
- HASH_ITER (hh, hm_samples, smp, tmp) {
- HASH_DEL(hm_samples, smp);
- source_samples_destroy(smp);
- }
- pthread_rwlock_wrlock(&hm_rwlock);
- struct profiler_entry *ent, *etmp;
- HASH_ITER (hh, hm_entries, ent, etmp) {
- HASH_DEL(hm_entries, ent);
- entry_destroy(ent);
- }
- pthread_rwlock_unlock(&hm_rwlock);
- reset_gpu_timers();
- }
- void source_profiler_enable(bool enable)
- {
- enable_next = enable;
- }
- void source_profiler_gpu_enable(bool enable)
- {
- gpu_enable_next = enable && enable_next;
- }
- void source_profiler_reset_video(struct obs_video_info *ovi)
- {
- double fps = ceil((double)ovi->fps_num / (double)ovi->fps_den);
- profiler_samples = (uint64_t)(fps * 5);
- /* This is fine because the video thread won't be running at this point */
- profiler_shutdown();
- }
- void source_profiler_render_begin(void)
- {
- if (!gpu_enabled)
- return;
- gs_enter_context(obs->video.graphics);
- if (!timer_ranges[timer_idx])
- timer_ranges[timer_idx] = gs_timer_range_create();
- gs_timer_range_begin(timer_ranges[timer_idx]);
- gs_leave_context();
- }
- void source_profiler_render_end(void)
- {
- if (!gpu_enabled || !timer_ranges[timer_idx])
- return;
- gs_enter_context(obs->video.graphics);
- gs_timer_range_end(timer_ranges[timer_idx]);
- gs_leave_context();
- }
- void source_profiler_frame_begin(void)
- {
- if (!enabled && enable_next)
- enabled = true;
- if (!gpu_enabled && enabled && gpu_enable_next) {
- gpu_enabled = true;
- } else if (gpu_enabled) {
- /* Advance timer idx if gpu enabled */
- timer_idx = (timer_idx + 1) % FRAME_BUFFER_SIZE;
- }
- }
- static inline bool is_async_video_source(const struct obs_source *source)
- {
- return (source->info.output_flags & OBS_SOURCE_ASYNC_VIDEO) == OBS_SOURCE_ASYNC_VIDEO;
- }
- static const char *source_profiler_frame_collect_name = "source_profiler_frame_collect";
- void source_profiler_frame_collect(void)
- {
- if (!enabled)
- return;
- profile_start(source_profiler_frame_collect_name);
- bool gpu_disjoint = false;
- bool gpu_ready = false;
- uint64_t freq = 0;
- if (gpu_enabled) {
- uint8_t timer_range_idx = (timer_idx + 1) % FRAME_BUFFER_SIZE;
- if (timer_ranges[timer_range_idx]) {
- gpu_ready = true;
- gs_enter_context(obs->video.graphics);
- gs_timer_range_get_data(timer_ranges[timer_range_idx], &gpu_disjoint, &freq);
- }
- if (gpu_disjoint) {
- blog(LOG_WARNING, "GPU Timers were disjoint, discarding samples.");
- }
- }
- pthread_rwlock_wrlock(&hm_rwlock);
- struct source_samples *smps = hm_samples;
- while (smps) {
- /* processing is delayed by FRAME_BUFFER_SIZE - 1 frames */
- uint8_t frame_idx = (smps->frame_idx + 1) % FRAME_BUFFER_SIZE;
- struct frame_sample *smp = smps->frames[frame_idx];
- if (!smp->tick) {
- /* No data yet */
- smps = smps->hh.next;
- continue;
- }
- struct profiler_entry *ent;
- HASH_FIND_PTR(hm_entries, &smps->key, ent);
- if (!ent) {
- ent = entry_create(smps->key);
- HASH_ADD_PTR(hm_entries, key, ent);
- }
- ucirclebuf_push(&ent->tick, smp->tick);
- if (smp->render_cpu.num) {
- uint64_t sum = 0;
- for (size_t idx = 0; idx < smp->render_cpu.num; idx++) {
- sum += smp->render_cpu.array[idx];
- }
- ucirclebuf_push(&ent->render_cpu, smp->render_cpu.array[0]);
- ucirclebuf_push(&ent->render_cpu_sum, sum);
- da_clear(smp->render_cpu);
- } else {
- ucirclebuf_push(&ent->render_cpu, 0);
- ucirclebuf_push(&ent->render_cpu_sum, 0);
- }
- /* Note that we still check this even if GPU profiling has been
- * disabled to destroy leftover timers. */
- if (smp->render_timers.num) {
- uint64_t sum = 0, first = 0, ticks = 0;
- for (size_t i = 0; i < smp->render_timers.num; i++) {
- gs_timer_t *timer = smp->render_timers.array[i];
- if (gpu_ready && !gpu_disjoint && gs_timer_get_data(timer, &ticks)) {
- /* Convert ticks to ns */
- sum += util_mul_div64(ticks, 1000000000ULL, freq);
- if (!first)
- first = sum;
- }
- gs_timer_destroy(timer);
- }
- if (first) {
- ucirclebuf_push(&ent->render_gpu, first);
- ucirclebuf_push(&ent->render_gpu_sum, sum);
- }
- da_clear(smp->render_timers);
- } else {
- ucirclebuf_push(&ent->render_gpu, 0);
- ucirclebuf_push(&ent->render_gpu_sum, 0);
- }
- const obs_source_t *src = *(const obs_source_t **)smps->hh.key;
- if (is_async_video_source(src)) {
- uint64_t ts = obs_source_get_last_async_ts(src);
- ucirclebuf_push(&ent->async_rendered_ts, ts);
- }
- smps = smps->hh.next;
- }
- pthread_rwlock_unlock(&hm_rwlock);
- if (gpu_enabled && gpu_ready)
- gs_leave_context();
- /* Apply updated states for next frame */
- if (!enable_next) {
- enabled = gpu_enabled = false;
- profiler_shutdown();
- } else if (!gpu_enable_next) {
- gpu_enabled = false;
- reset_gpu_timers();
- }
- profile_end(source_profiler_frame_collect_name);
- }
- void source_profiler_async_frame_received(obs_source_t *source)
- {
- if (!enabled)
- return;
- uint64_t ts = os_gettime_ns();
- pthread_rwlock_wrlock(&hm_rwlock);
- struct profiler_entry *ent;
- HASH_FIND_PTR(hm_entries, &source, ent);
- if (ent)
- ucirclebuf_push(&ent->async_frame_ts, ts);
- pthread_rwlock_unlock(&hm_rwlock);
- }
- uint64_t source_profiler_source_tick_start(void)
- {
- if (!enabled)
- return 0;
- return os_gettime_ns();
- }
- void source_profiler_source_tick_end(obs_source_t *source, uint64_t start)
- {
- if (!enabled)
- return;
- const uint64_t delta = os_gettime_ns() - start;
- struct source_samples *smp = NULL;
- HASH_FIND_PTR(hm_samples, &source, smp);
- if (!smp) {
- smp = source_samples_create((uintptr_t)source);
- HASH_ADD_PTR(hm_samples, key, smp);
- } else {
- /* Advance index here since tick happens first and only once
- * at the start of each frame. */
- smp->frame_idx = (smp->frame_idx + 1) % FRAME_BUFFER_SIZE;
- }
- smp->frames[smp->frame_idx]->tick = delta;
- }
- uint64_t source_profiler_source_render_begin(gs_timer_t **timer)
- {
- if (!enabled)
- return 0;
- if (gpu_enabled) {
- *timer = gs_timer_create();
- gs_timer_begin(*timer);
- } else {
- *timer = NULL;
- }
- return os_gettime_ns();
- }
- void source_profiler_source_render_end(obs_source_t *source, uint64_t start, gs_timer_t *timer)
- {
- if (!enabled)
- return;
- if (timer)
- gs_timer_end(timer);
- const uint64_t delta = os_gettime_ns() - start;
- struct source_samples *smp;
- HASH_FIND_PTR(hm_samples, &source, smp);
- if (smp) {
- da_push_back(smp->frames[smp->frame_idx]->render_cpu, &delta);
- if (timer) {
- da_push_back(smp->frames[smp->frame_idx]->render_timers, &timer);
- }
- } else if (timer) {
- gs_timer_destroy(timer);
- }
- }
- static void task_delete_source(void *key)
- {
- struct source_samples *smp;
- HASH_FIND_PTR(hm_samples, &key, smp);
- if (smp) {
- HASH_DEL(hm_samples, smp);
- source_samples_destroy(smp);
- }
- pthread_rwlock_rdlock(&hm_rwlock);
- struct profiler_entry *ent = NULL;
- HASH_FIND_PTR(hm_entries, &key, ent);
- if (ent) {
- HASH_DEL(hm_entries, ent);
- entry_destroy(ent);
- }
- pthread_rwlock_unlock(&hm_rwlock);
- }
- void source_profiler_remove_source(obs_source_t *source)
- {
- if (!enabled)
- return;
- /* Schedule deletion task on graphics thread */
- obs_queue_task(OBS_TASK_GRAPHICS, task_delete_source, source, false);
- }
- static inline void calculate_tick(struct profiler_entry *ent, struct profiler_result *result)
- {
- size_t idx = 0;
- uint64_t sum = 0;
- for (; idx < ent->tick.num; idx++) {
- const uint64_t delta = ent->tick.array[idx];
- if (delta > result->tick_max)
- result->tick_max = delta;
- sum += delta;
- }
- if (idx)
- result->tick_avg = sum / idx;
- }
- static inline void calculate_render(struct profiler_entry *ent, struct profiler_result *result)
- {
- size_t idx;
- uint64_t sum = 0, sum_sum = 0;
- for (idx = 0; idx < ent->render_cpu.num; idx++) {
- const uint64_t delta = ent->render_cpu.array[idx];
- if (delta > result->render_max)
- result->render_max = delta;
- sum += delta;
- sum_sum += ent->render_cpu_sum.array[idx];
- }
- if (idx) {
- result->render_avg = sum / idx;
- result->render_sum = sum_sum / idx;
- }
- if (!gpu_enabled)
- return;
- sum = sum_sum = 0;
- for (idx = 0; idx < ent->render_gpu.num; idx++) {
- const uint64_t delta = ent->render_gpu.array[idx];
- if (delta > result->render_gpu_max)
- result->render_gpu_max = delta;
- sum += delta;
- sum_sum += ent->render_gpu_sum.array[idx];
- }
- if (idx) {
- result->render_gpu_avg = sum / idx;
- result->render_gpu_sum = sum_sum / idx;
- }
- }
- static inline void calculate_fps(const struct ucirclebuf *frames, double *avg, uint64_t *best, uint64_t *worst)
- {
- uint64_t deltas = 0, delta_sum = 0, best_delta = 0, worst_delta = 0;
- for (size_t idx = 0; idx < frames->num; idx++) {
- const uint64_t ts = frames->array[idx];
- if (!ts)
- break;
- size_t prev_idx = idx ? idx - 1 : frames->num - 1;
- const uint64_t prev_ts = frames->array[prev_idx];
- if (!prev_ts || prev_ts >= ts)
- continue;
- uint64_t delta = (ts - prev_ts);
- if (delta < best_delta || !best_delta)
- best_delta = delta;
- if (delta > worst_delta)
- worst_delta = delta;
- delta_sum += delta;
- deltas++;
- }
- if (deltas && delta_sum) {
- *avg = 1.0E9 / ((double)delta_sum / (double)deltas);
- *best = best_delta;
- *worst = worst_delta;
- }
- }
- bool source_profiler_fill_result(obs_source_t *source, struct profiler_result *result)
- {
- if (!enabled || !result)
- return false;
- memset(result, 0, sizeof(struct profiler_result));
- pthread_rwlock_rdlock(&hm_rwlock);
- struct profiler_entry *ent = NULL;
- HASH_FIND_PTR(hm_entries, &source, ent);
- if (ent) {
- calculate_tick(ent, result);
- calculate_render(ent, result);
- if (is_async_video_source(source)) {
- calculate_fps(&ent->async_frame_ts, &result->async_input, &result->async_input_best,
- &result->async_input_worst);
- calculate_fps(&ent->async_rendered_ts, &result->async_rendered, &result->async_rendered_best,
- &result->async_rendered_worst);
- }
- }
- pthread_rwlock_unlock(&hm_rwlock);
- return !!ent;
- }
- profiler_result_t *source_profiler_get_result(obs_source_t *source)
- {
- profiler_result_t *ret = bmalloc(sizeof(profiler_result_t));
- if (!source_profiler_fill_result(source, ret)) {
- bfree(ret);
- return NULL;
- }
- return ret;
- }
|