Browse Source

libobs: Add source profiler

Rodney 2 years ago
parent
commit
198581a475

+ 2 - 0
libobs/CMakeLists.txt

@@ -129,6 +129,8 @@ target_sources(
           util/profiler.h
           util/profiler.hpp
           util/serializer.h
+          util/source-profiler.c
+          util/source-profiler.h
           util/sse-intrin.h
           util/task.c
           util/task.h

+ 3 - 1
libobs/cmake/legacy.cmake

@@ -216,7 +216,9 @@ target_sources(
           util/util_uint128.h
           util/curl/curl-helper.h
           util/darray.h
-          util/util.hpp)
+          util/util.hpp
+          util/source-profiler.c
+          util/source-profiler.h)
 
 if(ENABLE_HEVC)
   target_sources(libobs PRIVATE obs-hevc.c obs-hevc.h)

+ 35 - 0
libobs/obs-internal.h

@@ -823,6 +823,7 @@ struct obs_source {
 	uint32_t async_cache_height;
 	uint32_t async_convert_width[MAX_AV_PLANES];
 	uint32_t async_convert_height[MAX_AV_PLANES];
+	uint64_t async_last_rendered_ts;
 
 	pthread_mutex_t caption_cb_mutex;
 	DARRAY(struct caption_cb_info) caption_cb_list;
@@ -1023,6 +1024,7 @@ extern void obs_source_deactivate(obs_source_t *source, enum view_type type);
 extern void obs_source_video_tick(obs_source_t *source, float seconds);
 extern float obs_source_get_target_volume(obs_source_t *source,
 					  obs_source_t *target);
+extern uint64_t obs_source_get_last_async_ts(const obs_source_t *source);
 
 extern void obs_source_audio_render(obs_source_t *source, uint32_t mixers,
 				    size_t channels, size_t sample_rate,
@@ -1400,3 +1402,36 @@ void obs_service_destroy(obs_service_t *service);
 
 void obs_output_remove_encoder_internal(struct obs_output *output,
 					struct obs_encoder *encoder);
+
+/** Internal Source Profiler functions **/
+
+/* Start of frame in graphics loop: applies pending enable requests and, when
+ * GPU profiling is active, advances to the next GPU timer range slot. */
+extern void source_profiler_frame_begin(void);
+/* Process data collected during frame: folds buffered per-source samples
+ * into the result buffers and applies pending disable requests. */
+extern void source_profiler_frame_collect(void);
+
+/* Start/end of outputs being rendered (GPU timer begin/end).
+ * Both are no-ops unless GPU profiling is enabled. */
+extern void source_profiler_render_begin(void);
+extern void source_profiler_render_end(void);
+
+/* Reset settings, buffers, and GPU timers when video settings change.
+ * The number of retained samples is derived from the FPS in `ovi`. */
+extern void source_profiler_reset_video(struct obs_video_info *ovi);
+
+/* Signal that source received an async frame (records the current time if
+ * the profiler already tracks the source). */
+extern void source_profiler_async_frame_received(obs_source_t *source);
+
+/* Get timestamp for start of tick (0 when the profiler is disabled) */
+extern uint64_t source_profiler_source_tick_start(void);
+/* Submit start timestamp for source; stores the elapsed tick time */
+extern void source_profiler_source_tick_end(obs_source_t *source,
+					    uint64_t start);
+
+/* Obtain GPU timer and start timestamp for render start of a source.
+ * Returns 0 and leaves *timer untouched when the profiler is disabled, so
+ * callers should initialize their timer pointer to NULL. */
+extern uint64_t source_profiler_source_render_begin(gs_timer_t **timer);
+/* Submit start timestamp and GPU timer after rendering source.
+ * The profiler takes over the timer (it is stored or destroyed). */
+extern void source_profiler_source_render_end(obs_source_t *source,
+					      uint64_t start,
+					      gs_timer_t *timer);
+
+/* Remove source from profiler hashmaps (queued as a graphics task) */
+extern void source_profiler_remove_source(obs_source_t *source);

+ 20 - 0
libobs/obs-source.c

@@ -668,6 +668,8 @@ void obs_source_destroy(struct obs_source *source)
 		obs_context_data_remove_name(&source->context,
 					     &obs->data.public_sources);
 
+	source_profiler_remove_source(source);
+
 	/* defer source destroy */
 	os_task_queue_queue_task(obs->destruction_task_thread,
 				 (os_task_t)obs_source_destroy_defer, source);
@@ -2579,6 +2581,7 @@ static void obs_source_update_async_video(obs_source_t *source)
 				source->async_update_texture = false;
 			}
 
+			source->async_last_rendered_ts = frame->timestamp;
 			obs_source_release_frame(source, frame);
 		}
 	}
@@ -2609,6 +2612,10 @@ static void rotate_async_video(obs_source_t *source, long rotation)
 static inline void obs_source_render_async_video(obs_source_t *source)
 {
 	if (source->async_textures[0] && source->async_active) {
+		gs_timer_t *timer = NULL;
+		const uint64_t start =
+			source_profiler_source_render_begin(&timer);
+
 		const enum gs_color_space source_space = convert_video_space(
 			source->async_format, source->async_trc);
 
@@ -2718,6 +2725,8 @@ static inline void obs_source_render_async_video(obs_source_t *source)
 		gs_technique_end(tech);
 
 		gs_set_linear_srgb(previous);
+
+		source_profiler_source_render_end(source, start, timer);
 	}
 }
 
@@ -2786,6 +2795,9 @@ static uint32_t get_base_height(const obs_source_t *source)
 
 static void source_render(obs_source_t *source, gs_effect_t *effect)
 {
+	gs_timer_t *timer = NULL;
+	const uint64_t start = source_profiler_source_render_begin(&timer);
+
 	void *const data = source->context.data;
 	const enum gs_color_space current_space = gs_get_color_space();
 	const enum gs_color_space source_space =
@@ -2912,6 +2924,7 @@ static void source_render(obs_source_t *source, gs_effect_t *effect)
 	} else {
 		source->info.video_render(data, effect);
 	}
+	source_profiler_source_render_end(source, start, timer);
 }
 
 void obs_source_default_render(obs_source_t *source)
@@ -3697,6 +3710,8 @@ obs_source_output_video_internal(obs_source_t *source,
 		return;
 	}
 
+	source_profiler_async_frame_received(source);
+
 	struct obs_source_frame *output = cache_video(source, frame);
 
 	/* ------------------------------------------- */
@@ -6308,3 +6323,8 @@ void obs_source_restore_filters(obs_source_t *source, obs_data_array_t *array)
 
 	da_free(cur_filters);
 }
+
+/* Returns source->async_last_rendered_ts, which
+ * obs_source_update_async_video() sets to the timestamp of the frame it
+ * just processed. */
+uint64_t obs_source_get_last_async_ts(const obs_source_t *source)
+{
+	return source->async_last_rendered_ts;
+}

+ 6 - 0
libobs/obs-video.c

@@ -77,7 +77,9 @@ static uint64_t tick_sources(uint64_t cur_time, uint64_t last_time)
 
 	for (size_t i = 0; i < data->sources_to_tick.num; i++) {
 		obs_source_t *s = data->sources_to_tick.array[i];
+		const uint64_t start = source_profiler_source_tick_start();
 		obs_source_video_tick(s, seconds);
+		source_profiler_source_tick_end(s, start);
 		obs_source_release(s);
 	}
 
@@ -1212,6 +1214,7 @@ bool obs_graphics_thread_loop(struct obs_graphics_context *context)
 	update_active_states();
 
 	profile_start(context->video_thread_name);
+	source_profiler_frame_begin();
 
 	gs_enter_context(obs->video.graphics);
 	gs_begin_frame();
@@ -1230,6 +1233,7 @@ bool obs_graphics_thread_loop(struct obs_graphics_context *context)
 	}
 #endif
 
+	source_profiler_render_begin();
 	profile_start(output_frame_name);
 	output_frames();
 	profile_end(output_frame_name);
@@ -1237,11 +1241,13 @@ bool obs_graphics_thread_loop(struct obs_graphics_context *context)
 	profile_start(render_displays_name);
 	render_displays();
 	profile_end(render_displays_name);
+	source_profiler_render_end();
 
 	execute_graphics_tasks();
 
 	frame_time_ns = os_gettime_ns() - frame_start;
 
+	source_profiler_frame_collect();
 	profile_end(context->video_thread_name);
 
 	profile_reenable_thread();

+ 2 - 0
libobs/obs.c

@@ -1591,6 +1591,8 @@ int obs_reset_video(struct obs_video_info *ovi)
 	     get_video_format_name(ovi->output_format),
 	     yuv ? yuv_format : "None", yuv ? "/" : "", yuv ? yuv_range : "");
 
+	source_profiler_reset_video(ovi);
+
 	return obs_init_video(ovi);
 }
 

+ 644 - 0
libobs/util/source-profiler.c

@@ -0,0 +1,644 @@
+/******************************************************************************
+    Copyright (C) 2023 by Dennis Sädtler <[email protected]>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+******************************************************************************/
+
+#include "source-profiler.h"
+
+#include "darray.h"
+#include "obs-internal.h"
+#include "platform.h"
+#include "threading.h"
+#include "uthash.h"
+
+/* Raw measurements gathered for one source during a single frame. */
+struct frame_sample {
+	/* Duration of the source's video tick (os_gettime_ns() delta) */
+	uint64_t tick;
+	/* CPU duration of every render pass of the source in this frame */
+	DARRAY(uint64_t) render_cpu;
+	/* GPU timers, one per render pass; only filled while GPU profiling
+	 * is enabled. Destroyed when the sample is collected or freed. */
+	DARRAY(gs_timer_t *) render_timers;
+};
+
+/* Buffer frame data collection to give GPU time to finish rendering.
+ * Set to the same as the rendering buffer (NUM_TEXTURES) */
+#define FRAME_BUFFER_SIZE NUM_TEXTURES
+
+/* Per-source ring of the most recent FRAME_BUFFER_SIZE frame samples. */
+struct source_samples {
+	/* the pointer address of the source is the hashtable key */
+	uintptr_t key;
+
+	/* Slot in `frames` currently being written; advanced once per tick */
+	uint8_t frame_idx;
+	struct frame_sample *frames[FRAME_BUFFER_SIZE];
+
+	UT_hash_handle hh;
+};
+
+/* Basic fixed-size circular buffer to hold most recent N uint64_t values
+ * (older items will be overwritten). */
+struct ucirclebuf {
+	/* Position the next pushed value is written to */
+	size_t idx;
+	/* Number of elements `array` can hold */
+	size_t capacity;
+	/* Number of valid elements stored so far (at most `capacity`) */
+	size_t num;
+
+	/* Storage allocated by ucirclebuf_init() */
+	uint64_t *array;
+};
+
+/* Aggregated history for one source; this is what result queries read.
+ * Entries live in hm_entries and are accessed under hm_rwlock. */
+struct profiler_entry {
+	/* the pointer address of the source is the hashtable key */
+	uintptr_t key;
+
+	/* Tick times for last N frames */
+	struct ucirclebuf tick;
+	/* Time of first render pass in a frame, for last N frames */
+	struct ucirclebuf render_cpu;
+	struct ucirclebuf render_gpu;
+	/* Sum of all render passes in a frame, for last N frames */
+	struct ucirclebuf render_cpu_sum;
+	struct ucirclebuf render_gpu_sum;
+	/* Timestamps of last N async frame submissions */
+	struct ucirclebuf async_frame_ts;
+	/* Timestamps of last N async frames rendered */
+	struct ucirclebuf async_rendered_ts;
+
+	UT_hash_handle hh;
+};
+
+/* Hashmaps, both keyed by source pointer.
+ * hm_samples holds raw per-frame samples and is used without locking in
+ * this file; hm_entries holds aggregated history and is guarded by
+ * hm_rwlock. */
+struct source_samples *hm_samples = NULL;
+struct profiler_entry *hm_entries = NULL;
+
+/* GPU timer ranges (only required for DirectX) */
+static uint8_t timer_idx = 0;
+static gs_timer_range_t *timer_ranges[FRAME_BUFFER_SIZE] = {0};
+
+/* Capacity of every ucirclebuf in a profiler_entry; set by
+ * source_profiler_reset_video() and 0 until then. */
+static uint64_t profiler_samples = 0;
+/* Sources can be rendered more than once per frame, to avoid reallocating
+ * memory in the majority of cases, reserve at least two. */
+static const size_t render_times_reservation = 2;
+
+/* Guards hm_entries */
+pthread_rwlock_t hm_rwlock = PTHREAD_RWLOCK_INITIALIZER;
+
+/* Currently active state, switched at frame boundaries */
+static bool enabled = false;
+static bool gpu_enabled = false;
+/* These can be set from other threads, mark them volatile.
+ * NOTE(review): volatile does not provide atomicity or ordering between
+ * threads; consider atomics if stricter guarantees are needed. */
+static volatile bool enable_next = false;
+static volatile bool gpu_enable_next = false;
+
+/* Set up `buf` as an empty circular buffer with room for `capacity` values.
+ * A capacity of zero is a no-op: `buf` is left exactly as it was passed in. */
+void ucirclebuf_init(struct ucirclebuf *buf, size_t capacity)
+{
+	if (capacity == 0)
+		return;
+
+	buf->idx = 0;
+	buf->num = 0;
+	buf->capacity = capacity;
+	buf->array = bmalloc(capacity * sizeof(uint64_t));
+}
+
+/* Release the storage owned by `buf` and reset it to the all-zero state. */
+void ucirclebuf_free(struct ucirclebuf *buf)
+{
+	uint64_t *const storage = buf->array;
+
+	buf->array = NULL;
+	buf->capacity = 0;
+	buf->num = 0;
+	buf->idx = 0;
+
+	bfree(storage);
+}
+
+/* Append `val` to the circular buffer, overwriting the oldest value once the
+ * buffer is full.
+ *
+ * A buffer without storage (capacity 0, e.g. because ucirclebuf_init() was
+ * called with a zero capacity) silently drops the value; without this guard
+ * the wrap-around below would compute `idx % 0`. */
+void ucirclebuf_push(struct ucirclebuf *buf, uint64_t val)
+{
+	if (!buf->capacity || !buf->array)
+		return;
+
+	if (buf->num == buf->capacity) {
+		/* Full: wrap the write position back to the start if needed */
+		buf->idx %= buf->capacity;
+	} else {
+		/* Still filling up: one more valid element */
+		buf->num++;
+	}
+
+	buf->array[buf->idx++] = val;
+}
+
+/* Allocate a zeroed frame sample with room pre-reserved for the typical
+ * number of render passes per frame. */
+static struct frame_sample *frame_sample_create(void)
+{
+	struct frame_sample *sample = bzalloc(sizeof(*sample));
+
+	da_reserve(sample->render_cpu, render_times_reservation);
+	da_reserve(sample->render_timers, render_times_reservation);
+
+	return sample;
+}
+
+/* Free a frame sample, destroying any GPU timers it still holds. */
+static void frame_sample_destroy(struct frame_sample *sample)
+{
+	const size_t timer_count = sample->render_timers.num;
+
+	/* Only enter the graphics context when there is something to destroy */
+	if (timer_count > 0) {
+		gs_enter_context(obs->video.graphics);
+		for (size_t t = 0; t < timer_count; t++) {
+			gs_timer_destroy(sample->render_timers.array[t]);
+		}
+		gs_leave_context();
+	}
+
+	da_free(sample->render_cpu);
+	da_free(sample->render_timers);
+	bfree(sample);
+}
+
+/* Allocate the per-source sample ring for `key` with every frame slot
+ * populated by a fresh frame sample. */
+struct source_samples *source_samples_create(const uintptr_t key)
+{
+	struct source_samples *samples = bzalloc(sizeof(*samples));
+	size_t slot = 0;
+
+	samples->key = key;
+	while (slot < FRAME_BUFFER_SIZE) {
+		samples->frames[slot] = frame_sample_create();
+		slot++;
+	}
+
+	return samples;
+}
+
+/* Destroy every frame slot of a per-source sample ring, then the ring. */
+static void source_samples_destroy(struct source_samples *sample)
+{
+	size_t slot = 0;
+
+	while (slot < FRAME_BUFFER_SIZE) {
+		frame_sample_destroy(sample->frames[slot]);
+		slot++;
+	}
+
+	bfree(sample);
+}
+
+/* Allocate a profiler entry for `key`; each history buffer is initialized
+ * with a capacity of `profiler_samples` values. */
+static struct profiler_entry *entry_create(const uintptr_t key)
+{
+	struct profiler_entry *entry = bzalloc(sizeof(*entry));
+	struct ucirclebuf *const buffers[] = {
+		&entry->tick,           &entry->render_cpu,
+		&entry->render_gpu,     &entry->render_cpu_sum,
+		&entry->render_gpu_sum, &entry->async_frame_ts,
+		&entry->async_rendered_ts,
+	};
+	const size_t buffer_count = sizeof(buffers) / sizeof(buffers[0]);
+
+	entry->key = key;
+	for (size_t b = 0; b < buffer_count; b++)
+		ucirclebuf_init(buffers[b], profiler_samples);
+
+	return entry;
+}
+
+/* Free all history buffers of a profiler entry, then the entry itself. */
+static void entry_destroy(struct profiler_entry *entry)
+{
+	struct ucirclebuf *const buffers[] = {
+		&entry->tick,           &entry->render_cpu,
+		&entry->render_gpu,     &entry->render_cpu_sum,
+		&entry->render_gpu_sum, &entry->async_frame_ts,
+		&entry->async_rendered_ts,
+	};
+	const size_t buffer_count = sizeof(buffers) / sizeof(buffers[0]);
+
+	for (size_t b = 0; b < buffer_count; b++)
+		ucirclebuf_free(buffers[b]);
+
+	bfree(entry);
+}
+
+/* Destroy every allocated GPU timer range and clear its slot. */
+static void reset_gpu_timers(void)
+{
+	gs_enter_context(obs->video.graphics);
+
+	for (int slot = 0; slot < FRAME_BUFFER_SIZE; slot++) {
+		if (!timer_ranges[slot])
+			continue;
+
+		gs_timer_range_destroy(timer_ranges[slot]);
+		timer_ranges[slot] = NULL;
+	}
+
+	gs_leave_context();
+}
+
+/* Drop all collected data: raw samples, aggregated entries and GPU timer
+ * ranges. */
+static void profiler_shutdown(void)
+{
+	struct source_samples *cur_samples, *next_samples;
+	struct profiler_entry *cur_entry, *next_entry;
+
+	/* Raw samples are emptied without taking hm_rwlock */
+	HASH_ITER (hh, hm_samples, cur_samples, next_samples) {
+		HASH_DEL(hm_samples, cur_samples);
+		source_samples_destroy(cur_samples);
+	}
+
+	/* Entries may be read concurrently, so exclude readers while
+	 * tearing the table down */
+	pthread_rwlock_wrlock(&hm_rwlock);
+	HASH_ITER (hh, hm_entries, cur_entry, next_entry) {
+		HASH_DEL(hm_entries, cur_entry);
+		entry_destroy(cur_entry);
+	}
+	pthread_rwlock_unlock(&hm_rwlock);
+
+	reset_gpu_timers();
+}
+
+/* Request that the profiler be enabled or disabled. Only the pending flag is
+ * written here; source_profiler_frame_begin() picks up an enable request
+ * and source_profiler_frame_collect() picks up a disable request. */
+void source_profiler_enable(bool enable)
+{
+	enable_next = enable;
+}
+
+/* Request that GPU profiling be enabled or disabled. An enable request is
+ * only recorded if the profiler itself has already been requested on
+ * (enable_next is true), so call source_profiler_enable(true) first. */
+void source_profiler_gpu_enable(bool enable)
+{
+	gpu_enable_next = enable && enable_next;
+}
+
+/* Recompute the history length from the new video settings (five seconds
+ * worth of frames at the rounded-up frame rate) and discard all data
+ * gathered so far. */
+void source_profiler_reset_video(struct obs_video_info *ovi)
+{
+	const double frame_rate = (double)ovi->fps_num / (double)ovi->fps_den;
+
+	profiler_samples = (uint64_t)(ceil(frame_rate) * 5);
+
+	/* This is fine because the video thread won't be running at this point */
+	profiler_shutdown();
+}
+
+/* Begin the GPU timer range for the current frame slot, creating the range
+ * on first use. Does nothing unless GPU profiling is enabled. */
+void source_profiler_render_begin(void)
+{
+	if (!gpu_enabled)
+		return;
+
+	gs_enter_context(obs->video.graphics);
+
+	gs_timer_range_t *range = timer_ranges[timer_idx];
+	if (!range) {
+		range = gs_timer_range_create();
+		timer_ranges[timer_idx] = range;
+	}
+	gs_timer_range_begin(range);
+
+	gs_leave_context();
+}
+
+/* End the GPU timer range for the current frame slot, provided GPU
+ * profiling is enabled and a range exists for that slot. */
+void source_profiler_render_end(void)
+{
+	if (gpu_enabled && timer_ranges[timer_idx]) {
+		gs_enter_context(obs->video.graphics);
+		gs_timer_range_end(timer_ranges[timer_idx]);
+		gs_leave_context();
+	}
+}
+
+/* Start-of-frame bookkeeping: apply pending enable requests and, when GPU
+ * profiling is already running, move on to the next timer range slot. */
+void source_profiler_frame_begin(void)
+{
+	if (!enabled && enable_next)
+		enabled = true;
+
+	if (gpu_enabled) {
+		/* Advance timer idx if gpu enabled */
+		timer_idx = (timer_idx + 1) % FRAME_BUFFER_SIZE;
+		return;
+	}
+
+	/* GPU profiling can only start once the profiler itself is active */
+	if (enabled && gpu_enable_next)
+		gpu_enabled = true;
+}
+
+/* True when every bit of OBS_SOURCE_ASYNC_VIDEO is set in the source's
+ * output flags (a full-mask comparison, not a single-bit test). */
+static inline bool is_async_video_source(const struct obs_source *source)
+{
+	return (source->info.output_flags & OBS_SOURCE_ASYNC_VIDEO) ==
+	       OBS_SOURCE_ASYNC_VIDEO;
+}
+
+/* Name under which this function shows up in the regular profiler */
+static const char *source_profiler_frame_collect_name =
+	"source_profiler_frame_collect";
+/* End-of-frame processing: for every tracked source, move the oldest
+ * buffered frame sample into the source's profiler entry (creating the entry
+ * on first use), resolve and destroy the GPU timers of that sample, and
+ * finally apply pending disable requests. Does nothing while the profiler
+ * is disabled. */
+void source_profiler_frame_collect(void)
+{
+	if (!enabled)
+		return;
+
+	profile_start(source_profiler_frame_collect_name);
+	bool gpu_disjoint = false;
+	bool gpu_ready = false;
+	uint64_t freq = 0;
+
+	if (gpu_enabled) {
+		/* The slot after the current one is the oldest timer range
+		 * in the ring */
+		uint8_t timer_range_idx = (timer_idx + 1) % FRAME_BUFFER_SIZE;
+
+		if (timer_ranges[timer_range_idx]) {
+			gpu_ready = true;
+			/* Context stays entered until after the hashmap loop;
+			 * it is left again below when gpu_ready is set */
+			gs_enter_context(obs->video.graphics);
+			gs_timer_range_get_data(timer_ranges[timer_range_idx],
+						&gpu_disjoint, &freq);
+		}
+
+		if (gpu_disjoint) {
+			blog(LOG_WARNING,
+			     "GPU Timers were disjoint, discarding samples.");
+		}
+	}
+
+	/* Write lock: entries are created and their buffers modified below */
+	pthread_rwlock_wrlock(&hm_rwlock);
+
+	struct source_samples *smps = hm_samples;
+	while (smps) {
+		/* processing is delayed by FRAME_BUFFER_SIZE - 1 frames */
+		uint8_t frame_idx = (smps->frame_idx + 1) % FRAME_BUFFER_SIZE;
+		struct frame_sample *smp = smps->frames[frame_idx];
+
+		/* NOTE(review): tick is not cleared after being consumed, so
+		 * a source that stops ticking appears to have the same slot
+		 * pushed again on every frame - confirm this is intended. */
+		if (!smp->tick) {
+			/* No data yet */
+			smps = smps->hh.next;
+			continue;
+		}
+
+		struct profiler_entry *ent;
+		HASH_FIND_PTR(hm_entries, &smps->key, ent);
+		if (!ent) {
+			ent = entry_create(smps->key);
+			HASH_ADD_PTR(hm_entries, key, ent);
+		}
+
+		ucirclebuf_push(&ent->tick, smp->tick);
+
+		if (smp->render_cpu.num) {
+			/* Record both the first pass and the per-frame total */
+			uint64_t sum = 0;
+			for (size_t idx = 0; idx < smp->render_cpu.num; idx++) {
+				sum += smp->render_cpu.array[idx];
+			}
+			ucirclebuf_push(&ent->render_cpu,
+					smp->render_cpu.array[0]);
+			ucirclebuf_push(&ent->render_cpu_sum, sum);
+			da_clear(smp->render_cpu);
+		}
+
+		/* Note that we still check this even if GPU profiling has been
+		 * disabled to destroy leftover timers. */
+		/* NOTE(review): when gpu_ready is false the graphics context
+		 * has not been entered here - confirm gs_timer_destroy() is
+		 * safe to call in that situation. */
+		if (smp->render_timers.num) {
+			uint64_t sum = 0, first = 0, ticks = 0;
+
+			for (size_t i = 0; i < smp->render_timers.num; i++) {
+				gs_timer_t *timer = smp->render_timers.array[i];
+
+				if (gpu_ready && !gpu_disjoint &&
+				    gs_timer_get_data(timer, &ticks)) {
+					/* Convert ticks to ns */
+					sum += util_mul_div64(
+						ticks, 1000000000ULL, freq);
+					/* `first` keeps the running total at
+					 * the first timer that returned
+					 * data */
+					if (!first)
+						first = sum;
+				}
+
+				gs_timer_destroy(timer);
+			}
+
+			/* Nothing is recorded if no timer produced data */
+			if (first) {
+				ucirclebuf_push(&ent->render_gpu, first);
+				ucirclebuf_push(&ent->render_gpu_sum, sum);
+			}
+			da_clear(smp->render_timers);
+		}
+
+		/* Presumably hh.key points at the `key` member, which holds
+		 * the source pointer value - verify against uthash. */
+		const obs_source_t *src = *(const obs_source_t **)smps->hh.key;
+		if (is_async_video_source(src)) {
+			uint64_t ts = obs_source_get_last_async_ts(src);
+			ucirclebuf_push(&ent->async_rendered_ts, ts);
+		}
+
+		smps = smps->hh.next;
+	}
+
+	pthread_rwlock_unlock(&hm_rwlock);
+
+	if (gpu_enabled && gpu_ready)
+		gs_leave_context();
+
+	/* Apply updated states for next frame */
+	if (!enable_next) {
+		enabled = gpu_enabled = false;
+		profiler_shutdown();
+	} else if (!gpu_enable_next) {
+		gpu_enabled = false;
+		reset_gpu_timers();
+	}
+
+	profile_end(source_profiler_frame_collect_name);
+}
+
+/* Record the arrival time of an async frame for `source`. Only sources that
+ * already have a profiler entry are tracked; others are ignored. */
+void source_profiler_async_frame_received(obs_source_t *source)
+{
+	if (!enabled)
+		return;
+
+	/* Take the timestamp before potentially waiting on the lock */
+	const uint64_t received_at = os_gettime_ns();
+	struct profiler_entry *entry = NULL;
+
+	pthread_rwlock_wrlock(&hm_rwlock);
+	HASH_FIND_PTR(hm_entries, &source, entry);
+	if (entry != NULL)
+		ucirclebuf_push(&entry->async_frame_ts, received_at);
+	pthread_rwlock_unlock(&hm_rwlock);
+}
+
+/* Timestamp marking the start of a source tick, or 0 while the profiler is
+ * disabled. */
+uint64_t source_profiler_source_tick_start(void)
+{
+	return enabled ? os_gettime_ns() : 0;
+}
+
+/* Store how long the tick of `source` took, measured from `start`.
+ * Sample storage for the source is created on its first tick. */
+void source_profiler_source_tick_end(obs_source_t *source, uint64_t start)
+{
+	if (!enabled)
+		return;
+
+	const uint64_t elapsed = os_gettime_ns() - start;
+	struct source_samples *samples = NULL;
+
+	HASH_FIND_PTR(hm_samples, &source, samples);
+	if (samples) {
+		/* Advance index here since tick happens first and only once
+		 * at the start of each frame. */
+		samples->frame_idx =
+			(samples->frame_idx + 1) % FRAME_BUFFER_SIZE;
+	} else {
+		samples = source_samples_create((uintptr_t)source);
+		HASH_ADD_PTR(hm_samples, key, samples);
+	}
+
+	samples->frames[samples->frame_idx]->tick = elapsed;
+}
+
+/* Start measuring a render pass. Returns the CPU start timestamp and stores
+ * a started GPU timer in *timer when GPU profiling is enabled (NULL
+ * otherwise). While the profiler is disabled, returns 0 and leaves *timer
+ * untouched. */
+uint64_t source_profiler_source_render_begin(gs_timer_t **timer)
+{
+	if (!enabled)
+		return 0;
+
+	gs_timer_t *gpu_timer = NULL;
+	if (gpu_enabled) {
+		gpu_timer = gs_timer_create();
+		gs_timer_begin(gpu_timer);
+	}
+	*timer = gpu_timer;
+
+	return os_gettime_ns();
+}
+
+/* Finish measuring a render pass of `source`: stop the GPU timer (if any)
+ * and append the CPU time and timer to the source's current frame sample.
+ * If the source has no sample storage, the timer is destroyed instead. */
+void source_profiler_source_render_end(obs_source_t *source, uint64_t start,
+				       gs_timer_t *timer)
+{
+	if (!enabled)
+		return;
+
+	/* Stop the GPU timer before taking the CPU end timestamp */
+	if (timer)
+		gs_timer_end(timer);
+
+	const uint64_t elapsed = os_gettime_ns() - start;
+
+	struct source_samples *samples;
+	HASH_FIND_PTR(hm_samples, &source, samples);
+
+	if (!samples) {
+		if (timer)
+			gs_timer_destroy(timer);
+		return;
+	}
+
+	struct frame_sample *frame = samples->frames[samples->frame_idx];
+	da_push_back(frame->render_cpu, &elapsed);
+	if (timer)
+		da_push_back(frame->render_timers, &timer);
+}
+
+/* Graphics task that removes all profiler data belonging to one source.
+ * `key` is the source pointer the hashmaps are keyed by. */
+static void task_delete_source(void *key)
+{
+	struct source_samples *smp;
+	HASH_FIND_PTR(hm_samples, &key, smp);
+	if (smp) {
+		HASH_DEL(hm_samples, smp);
+		source_samples_destroy(smp);
+	}
+
+	/* The entry is unlinked and freed below, so readers holding the read
+	 * lock (source_profiler_fill_result) must be excluded: take the
+	 * write lock, a read lock does not protect against modification. */
+	pthread_rwlock_wrlock(&hm_rwlock);
+	struct profiler_entry *ent = NULL;
+	HASH_FIND_PTR(hm_entries, &key, ent);
+	if (ent) {
+		HASH_DEL(hm_entries, ent);
+		entry_destroy(ent);
+	}
+	pthread_rwlock_unlock(&hm_rwlock);
+}
+
+/* Forget everything recorded for `source`. The removal itself is performed
+ * by task_delete_source(), queued as a graphics task without waiting for
+ * it. Nothing is queued while the profiler is disabled. */
+void source_profiler_remove_source(obs_source_t *source)
+{
+	if (enabled) {
+		/* Schedule deletion task on graphics thread */
+		obs_queue_task(OBS_TASK_GRAPHICS, task_delete_source, source,
+			       false);
+	}
+}
+
+/* Fill tick_avg / tick_max of `result` from the entry's tick history.
+ * tick_max is only ever raised; tick_avg is left untouched when there is no
+ * history. */
+static inline void calculate_tick(struct profiler_entry *ent,
+				  struct profiler_result *result)
+{
+	const size_t count = ent->tick.num;
+	uint64_t total = 0;
+
+	for (size_t i = 0; i < count; i++) {
+		const uint64_t tick_time = ent->tick.array[i];
+
+		total += tick_time;
+		if (result->tick_max < tick_time)
+			result->tick_max = tick_time;
+	}
+
+	if (count > 0)
+		result->tick_avg = total / count;
+}
+
+/* Fill the CPU and (if GPU profiling is enabled) GPU render statistics of
+ * `result`. Average and max refer to the first render pass of each frame,
+ * the "sum" fields to the average of the per-frame totals. */
+static inline void calculate_render(struct profiler_entry *ent,
+				    struct profiler_result *result)
+{
+	size_t count = ent->render_cpu.num;
+	uint64_t first_total = 0, frame_total = 0;
+
+	for (size_t i = 0; i < count; i++) {
+		const uint64_t first_pass = ent->render_cpu.array[i];
+
+		first_total += first_pass;
+		frame_total += ent->render_cpu_sum.array[i];
+		if (result->render_max < first_pass)
+			result->render_max = first_pass;
+	}
+
+	if (count > 0) {
+		result->render_avg = first_total / count;
+		result->render_sum = frame_total / count;
+	}
+
+	if (!gpu_enabled)
+		return;
+
+	/* Same again for the GPU timings */
+	count = ent->render_gpu.num;
+	first_total = 0;
+	frame_total = 0;
+
+	for (size_t i = 0; i < count; i++) {
+		const uint64_t first_pass = ent->render_gpu.array[i];
+
+		first_total += first_pass;
+		frame_total += ent->render_gpu_sum.array[i];
+		if (result->render_gpu_max < first_pass)
+			result->render_gpu_max = first_pass;
+	}
+
+	if (count > 0) {
+		result->render_gpu_avg = first_total / count;
+		result->render_gpu_sum = frame_total / count;
+	}
+}
+
+/* Derive frame rate statistics from a ring of frame timestamps (ns).
+ * Each slot is compared with its predecessor in the array (slot 0 with the
+ * last valid slot); pairs whose predecessor is missing or not older are
+ * skipped, which also drops the pair that straddles the ring's write
+ * position. Outputs are only written when at least one interval was found:
+ * *avg in frames per second, *best / *worst as shortest / longest interval
+ * in ns. */
+static inline void calculate_fps(const struct ucirclebuf *frames, double *avg,
+				 uint64_t *best, uint64_t *worst)
+{
+	uint64_t intervals = 0, total = 0, shortest = 0, longest = 0;
+
+	for (size_t i = 0; i < frames->num; i++) {
+		const uint64_t cur = frames->array[i];
+		if (cur == 0)
+			break;
+
+		const size_t before = (i == 0) ? frames->num - 1 : i - 1;
+		const uint64_t prev = frames->array[before];
+		if (prev == 0 || prev >= cur)
+			continue;
+
+		const uint64_t gap = cur - prev;
+		if (shortest == 0 || gap < shortest)
+			shortest = gap;
+		if (longest < gap)
+			longest = gap;
+
+		total += gap;
+		intervals++;
+	}
+
+	if (!intervals || !total)
+		return;
+
+	*avg = 1.0E9 / ((double)total / (double)intervals);
+	*best = shortest;
+	*worst = longest;
+}
+
+/* Fill `result` with the latest statistics for `source`.
+ *
+ * Returns false if the profiler is disabled, `result` is NULL, or no data
+ * has been collected for the source yet. Returns true with an all-zero
+ * result if the source is not enabled. `result` is zeroed before being
+ * filled whenever the profiler is enabled. */
+bool source_profiler_fill_result(obs_source_t *source,
+				 struct profiler_result *result)
+{
+	if (!enabled || !result)
+		return false;
+
+	memset(result, 0, sizeof(*result));
+	/* No or only stale data available */
+	if (!obs_source_enabled(source))
+		return true;
+
+	bool found = false;
+	struct profiler_entry *entry = NULL;
+
+	pthread_rwlock_rdlock(&hm_rwlock);
+
+	HASH_FIND_PTR(hm_entries, &source, entry);
+	if (entry) {
+		found = true;
+
+		calculate_tick(entry, result);
+		calculate_render(entry, result);
+
+		if (is_async_video_source(source)) {
+			calculate_fps(&entry->async_frame_ts,
+				      &result->async_input,
+				      &result->async_input_best,
+				      &result->async_input_worst);
+			calculate_fps(&entry->async_rendered_ts,
+				      &result->async_rendered,
+				      &result->async_rendered_best,
+				      &result->async_rendered_worst);
+		}
+	}
+
+	pthread_rwlock_unlock(&hm_rwlock);
+
+	return found;
+}
+
+/* Allocate a result object and fill it for `source`. Returns NULL when
+ * source_profiler_fill_result() reports failure; otherwise the caller owns
+ * the returned object and must release it with bfree(). */
+profiler_result_t *source_profiler_get_result(obs_source_t *source)
+{
+	profiler_result_t *result = bmalloc(sizeof(*result));
+
+	if (source_profiler_fill_result(source, result))
+		return result;
+
+	bfree(result);
+	return NULL;
+}

+ 67 - 0
libobs/util/source-profiler.h

@@ -0,0 +1,67 @@
+/******************************************************************************
+    Copyright (C) 2023 by Dennis Sädtler <[email protected]>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+******************************************************************************/
+
+#pragma once
+
+#include "obs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Statistics for one source over the retained sample history. */
+typedef struct profiler_result {
+	/* Tick times in ns */
+	uint64_t tick_avg;
+	uint64_t tick_max;
+
+	/* Average and max render times for CPU and GPU in ns
+	 * (of the first render pass in each frame). The GPU values are only
+	 * filled while GPU profiling is enabled. */
+	uint64_t render_avg;
+	uint64_t render_max;
+	uint64_t render_gpu_avg;
+	uint64_t render_gpu_max;
+
+	/* Average of the sum of all render passes in a frame in ns
+	 * (a source can be rendered more than once per frame). */
+	uint64_t render_sum;
+	uint64_t render_gpu_sum;
+
+	/* FPS of submitted async input */
+	double async_input;
+	/* Actually rendered async frames */
+	double async_rendered;
+
+	/* Best and worst frame times of input/output in ns
+	 * (shortest and longest interval between consecutive timestamps) */
+	uint64_t async_input_best;
+	uint64_t async_input_worst;
+	uint64_t async_rendered_best;
+	uint64_t async_rendered_worst;
+} profiler_result_t;
+
+/* Enable/disable profiler (applied on next frame) */
+EXPORT void source_profiler_enable(bool enable);
+/* Enable/disable GPU profiling (applied on next frame).
+ * Enabling only takes effect if the profiler itself has been enabled
+ * beforehand via source_profiler_enable(true). */
+EXPORT void source_profiler_gpu_enable(bool enable);
+
+/* Get latest profiling results for source (must be freed by user with
+ * bfree()). Returns NULL if the profiler is disabled or no data has been
+ * collected for the source. */
+EXPORT profiler_result_t *source_profiler_get_result(obs_source_t *source);
+/* Update existing profiler results object for source.
+ * Returns false if the profiler is disabled, result is NULL, or no data has
+ * been collected for the source; a source that is not enabled yields true
+ * with a zeroed result. */
+EXPORT bool source_profiler_fill_result(obs_source_t *source,
+					profiler_result_t *result);
+
+#ifdef __cplusplus
+}
+#endif