Browse Source

Merge pull request #10608 from derrod/obs-mp4-muxer

Native Hybrid MP4 Muxer
Lain 1 year ago
parent
commit
f71a67cd3a

+ 17 - 0
UI/api-interface.cpp

@@ -337,6 +337,23 @@ struct OBSStudioAPI : obs_frontend_callbacks {
 		}
 	}
 
+	/* Asks the recording output to add a chapter marker through its
+	 * "add_chapter" procedure. Returns false without calling it when no
+	 * recording is active or the recording is paused; otherwise returns
+	 * the result of the procedure call. */
+	bool obs_frontend_recording_add_chapter(const char *name) override
+	{
+		if (!os_atomic_load_bool(&recording_active))
+			return false;
+		if (os_atomic_load_bool(&recording_paused))
+			return false;
+
+		calldata args;
+		calldata_init(&args);
+		calldata_set_string(&args, "chapter_name", name);
+
+		proc_handler_t *handler = obs_output_get_proc_handler(
+			main->outputHandler->fileOutput);
+		const bool added =
+			proc_handler_call(handler, "add_chapter", &args);
+
+		calldata_free(&args);
+		return added;
+	}
+
 	void obs_frontend_replay_buffer_start(void) override
 	{
 		QMetaObject::invokeMethod(main, "StartReplayBuffer");

+ 2 - 0
UI/data/locale/en-US.ini

@@ -732,6 +732,7 @@ Basic.Main.StopRecording="Stop Recording"
 Basic.Main.PauseRecording="Pause Recording"
 Basic.Main.UnpauseRecording="Unpause Recording"
 Basic.Main.SplitFile="Split Recording File"
+Basic.Main.AddChapterMarker="Add Chapter Marker"
 Basic.Main.StoppingRecording="Stopping Recording..."
 Basic.Main.StopReplayBuffer="Stop Replay Buffer"
 Basic.Main.StoppingReplayBuffer="Stopping Replay Buffer..."
@@ -966,6 +967,7 @@ Basic.Settings.Output.Format.MP4="MPEG-4 (.mp4)"
 Basic.Settings.Output.Format.MOV="QuickTime (.mov)"
 Basic.Settings.Output.Format.TS="MPEG-TS (.ts)"
 Basic.Settings.Output.Format.HLS="HLS (.m3u8 + .ts)"
+Basic.Settings.Output.Format.hMP4="Hybrid MP4 [BETA] (.mp4)"
 Basic.Settings.Output.Format.fMP4="Fragmented MP4 (.mp4)"
 Basic.Settings.Output.Format.fMOV="Fragmented MOV (.mov)"
 Basic.Settings.Output.Format.TT.fragmented_mov="Fragmented MOV writes the recording in chunks and does not require the same finalization as traditional MOV files.\nThis ensures the file remains playable even if writing to disk is interrupted, for example, as a result of a BSOD or power loss.\n\nThis may not be compatible with all players and editors. Use File → Remux Recordings to convert the file into a more compatible format if necessary."

+ 14 - 0
UI/ffmpeg-utils.cpp

@@ -185,6 +185,20 @@ static const unordered_map<string, unordered_set<string>> codec_compat = {
 		 "pcm_f32le",
 #endif
 	 }},
+	// Not part of FFmpeg, see obs-outputs module
+	{"hybrid_mp4",
+	 {
+		 "h264",
+		 "hevc",
+		 "av1",
+		 "aac",
+		 "opus",
+		 "alac",
+		 "flac",
+		 "pcm_s16le",
+		 "pcm_s24le",
+		 "pcm_f32le",
+	 }},
 	{"mov",
 	 {
 		 "h264",

+ 2 - 0
UI/obs-app.cpp

@@ -1783,6 +1783,8 @@ string GetFormatExt(const char *container)
 	string ext = container;
 	if (ext == "fragmented_mp4")
 		ext = "mp4";
+	if (ext == "hybrid_mp4")
+		ext = "mp4";
 	else if (ext == "fragmented_mov")
 		ext = "mov";
 	else if (ext == "hls")

+ 6 - 0
UI/obs-frontend-api/obs-frontend-api.cpp

@@ -285,6 +285,12 @@ bool obs_frontend_recording_split_file(void)
 				   : false;
 }
 
+/* Forwards the chapter request to the registered frontend callbacks and
+ * reports failure when the callbacks are not valid. */
+bool obs_frontend_recording_add_chapter(const char *name)
+{
+	if (!callbacks_valid())
+		return false;
+
+	return c->obs_frontend_recording_add_chapter(name);
+}
+
 void obs_frontend_replay_buffer_start(void)
 {
 	if (callbacks_valid())

+ 1 - 0
UI/obs-frontend-api/obs-frontend-api.h

@@ -190,6 +190,7 @@ EXPORT bool obs_frontend_recording_active(void);
 EXPORT void obs_frontend_recording_pause(bool pause);
 EXPORT bool obs_frontend_recording_paused(void);
 EXPORT bool obs_frontend_recording_split_file(void);
+EXPORT bool obs_frontend_recording_add_chapter(const char *name);
 
 EXPORT void obs_frontend_replay_buffer_start(void);
 EXPORT void obs_frontend_replay_buffer_save(void);

+ 1 - 0
UI/obs-frontend-api/obs-frontend-internal.hpp

@@ -53,6 +53,7 @@ struct obs_frontend_callbacks {
 	virtual void obs_frontend_recording_pause(bool pause) = 0;
 	virtual bool obs_frontend_recording_paused(void) = 0;
 	virtual bool obs_frontend_recording_split_file(void) = 0;
+	virtual bool obs_frontend_recording_add_chapter(const char *name) = 0;
 
 	virtual void obs_frontend_replay_buffer_start(void) = 0;
 	virtual void obs_frontend_replay_buffer_save(void) = 0;

+ 11 - 2
UI/window-basic-main-outputs.cpp

@@ -697,6 +697,9 @@ SimpleOutput::SimpleOutput(OBSBasic *main_) : BasicOutputHandler(main_)
 	if (!ffmpegOutput) {
 		bool useReplayBuffer = config_get_bool(main->Config(),
 						       "SimpleOutput", "RecRB");
+		const char *recFormat = config_get_string(
+			main->Config(), "SimpleOutput", "RecFormat2");
+
 		if (useReplayBuffer) {
 			OBSDataAutoRelease hotkey;
 			const char *str = config_get_string(
@@ -728,8 +731,10 @@ SimpleOutput::SimpleOutput(OBSBasic *main_) : BasicOutputHandler(main_)
 						  OBSReplayBufferSaved, this);
 		}
 
+		bool use_native = strcmp(recFormat, "hybrid_mp4") == 0;
 		fileOutput = obs_output_create(
-			"ffmpeg_muxer", "simple_file_output", nullptr, nullptr);
+			use_native ? "mp4_output" : "ffmpeg_muxer",
+			"simple_file_output", nullptr, nullptr);
 		if (!fileOutput)
 			throw "Failed to create recording output "
 			      "(simple output)";
@@ -1568,6 +1573,8 @@ AdvancedOutput::AdvancedOutput(OBSBasic *main_) : BasicOutputHandler(main_)
 		config_get_string(main->Config(), "AdvOut", "RecEncoder");
 	const char *recAudioEncoder =
 		config_get_string(main->Config(), "AdvOut", "RecAudioEncoder");
+	const char *recFormat =
+		config_get_string(main->Config(), "AdvOut", "RecFormat2");
 #ifdef __APPLE__
 	translate_macvth264_encoder(streamEncoder);
 	translate_macvth264_encoder(recordEncoder);
@@ -1623,8 +1630,10 @@ AdvancedOutput::AdvancedOutput(OBSBasic *main_) : BasicOutputHandler(main_)
 						  OBSReplayBufferSaved, this);
 		}
 
+		bool native_muxer = strcmp(recFormat, "hybrid_mp4") == 0;
 		fileOutput = obs_output_create(
-			"ffmpeg_muxer", "adv_file_output", nullptr, nullptr);
+			native_muxer ? "mp4_output" : "ffmpeg_muxer",
+			"adv_file_output", nullptr, nullptr);
 		if (!fileOutput)
 			throw "Failed to create recording output "
 			      "(advanced output)";

+ 18 - 0
UI/window-basic-main.cpp

@@ -2808,6 +2808,23 @@ void OBSBasic::CreateHotkeys()
 		this);
 	LoadHotkey(splitFileHotkey, "OBSBasic.SplitFile");
 
+	/* Adding chapters is only supported by the native MP4 output */
+	const string_view output_id =
+		obs_output_get_id(outputHandler->fileOutput);
+	if (output_id == "mp4_output") {
+		addChapterHotkey = obs_hotkey_register_frontend(
+			"OBSBasic.AddChapterMarker",
+			Str("Basic.Main.AddChapterMarker"),
+			[](void *, obs_hotkey_id, obs_hotkey_t *,
+			   bool pressed) {
+				if (pressed)
+					obs_frontend_recording_add_chapter(
+						nullptr);
+			},
+			this);
+		LoadHotkey(addChapterHotkey, "OBSBasic.AddChapterMarker");
+	}
+
 	replayBufHotkeys = obs_hotkey_pair_register_frontend(
 		"OBSBasic.StartReplayBuffer",
 		Str("Basic.Main.StartReplayBuffer"),
@@ -2936,6 +2953,7 @@ void OBSBasic::ClearHotkeys()
 	obs_hotkey_pair_unregister(recordingHotkeys);
 	obs_hotkey_pair_unregister(pauseHotkeys);
 	obs_hotkey_unregister(splitFileHotkey);
+	obs_hotkey_unregister(addChapterHotkey);
 	obs_hotkey_pair_unregister(replayBufHotkeys);
 	obs_hotkey_pair_unregister(vcamHotkeys);
 	obs_hotkey_pair_unregister(togglePreviewHotkeys);

+ 2 - 1
UI/window-basic-main.hpp

@@ -459,7 +459,8 @@ private:
 	obs_hotkey_pair_id streamingHotkeys, recordingHotkeys, pauseHotkeys,
 		replayBufHotkeys, vcamHotkeys, togglePreviewHotkeys,
 		contextBarHotkeys;
-	obs_hotkey_id forceStreamingStopHotkey, splitFileHotkey;
+	/* addChapterHotkey is only registered when the file output is the
+	 * native MP4 output, yet it is always unregistered when hotkeys are
+	 * cleared; default it to the invalid ID so that call never reads an
+	 * indeterminate value. */
+	obs_hotkey_id forceStreamingStopHotkey, splitFileHotkey,
+		addChapterHotkey = OBS_INVALID_HOTKEY_ID;
 
 	void InitDefaultTransitions();
 	void InitTransition(obs_source_t *transition);

+ 2 - 0
UI/window-basic-settings.cpp

@@ -1157,6 +1157,7 @@ void OBSBasicSettings::LoadFormats()
 	ui->simpleOutRecFormat->addItem(FORMAT_STR("MKV"), "mkv");
 	ui->simpleOutRecFormat->addItem(FORMAT_STR("MP4"), "mp4");
 	ui->simpleOutRecFormat->addItem(FORMAT_STR("MOV"), "mov");
+	ui->simpleOutRecFormat->addItem(FORMAT_STR("hMP4"), "hybrid_mp4");
 	ui->simpleOutRecFormat->addItem(FORMAT_STR("fMP4"), "fragmented_mp4");
 	ui->simpleOutRecFormat->addItem(FORMAT_STR("fMOV"), "fragmented_mov");
 	ui->simpleOutRecFormat->addItem(FORMAT_STR("TS"), "mpegts");
@@ -1165,6 +1166,7 @@ void OBSBasicSettings::LoadFormats()
 	ui->advOutRecFormat->addItem(FORMAT_STR("MKV"), "mkv");
 	ui->advOutRecFormat->addItem(FORMAT_STR("MP4"), "mp4");
 	ui->advOutRecFormat->addItem(FORMAT_STR("MOV"), "mov");
+	ui->advOutRecFormat->addItem(FORMAT_STR("hMP4"), "hybrid_mp4");
 	ui->advOutRecFormat->addItem(FORMAT_STR("fMP4"), "fragmented_mp4");
 	ui->advOutRecFormat->addItem(FORMAT_STR("fMOV"), "fragmented_mov");
 	ui->advOutRecFormat->addItem(FORMAT_STR("TS"), "mpegts");

+ 5 - 0
plugins/obs-outputs/CMakeLists.txt

@@ -36,6 +36,10 @@ target_sources(
           librtmp/rtmp.c
           librtmp/rtmp.h
           librtmp/rtmp_sys.h
+          mp4-mux-internal.h
+          mp4-mux.c
+          mp4-mux.h
+          mp4-output.c
           net-if.c
           net-if.h
           null-output.c
@@ -60,6 +64,7 @@ target_link_libraries(
   obs-outputs
   PRIVATE OBS::libobs
           OBS::happy-eyeballs
+          OBS::opts-parser
           MbedTLS::MbedTLS
           ZLIB::ZLIB
           $<$<PLATFORM_ID:Windows>:OBS::w32-pthreads>

+ 5 - 0
plugins/obs-outputs/data/locale/en-US.ini

@@ -7,6 +7,11 @@ FLVOutput="FLV File Output"
 FLVOutput.FilePath="File Path"
 Default="Default"
 
+MP4Output="MP4 File Output"
+MP4Output.FilePath="File Path"
+MP4Output.StartChapter="Start"
+MP4Output.UnnamedChapter="Unnamed"
+
 IPFamily="IP Address Family"
 IPFamily.Both="IPv4 and IPv6 (Default)"
 IPFamily.V4Only="IPv4 Only"

+ 345 - 0
plugins/obs-outputs/mp4-mux-internal.h

@@ -0,0 +1,345 @@
+/******************************************************************************
+    Copyright (C) 2024 by Dennis Sädtler <[email protected]>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+******************************************************************************/
+
+#pragma once
+
+#include "mp4-mux.h"
+
+#include <util/darray.h>
+#include <util/deque.h>
+#include <util/serializer.h>
+
+/* Container flavour selecting target compatibility (see mp4_mux.mode) */
+enum mp4_flavour {
+	MP4,  /* ISO/IEC 14496-12 */
+	MOV,  /* Apple QuickTime */
+	CMAF, /* ISO/IEC 23000-19 */
+};
+
+enum mp4_track_type { /* Media kind of a track (see mp4_track.type) */
+	TRACK_UNKNOWN,
+	TRACK_VIDEO,
+	TRACK_AUDIO,
+	TRACK_CHAPTERS,
+};
+
+enum mp4_codec { /* Codec of a track's samples (see mp4_track.codec) */
+	CODEC_UNKNOWN,
+
+	/* Video Codecs */
+	CODEC_H264,
+	CODEC_HEVC,
+	CODEC_AV1,
+
+	/* Audio Codecs */
+	CODEC_AAC,
+	CODEC_OPUS,
+	CODEC_FLAC,
+	CODEC_ALAC,
+	CODEC_PCM_I16,
+	CODEC_PCM_I24,
+	CODEC_PCM_F32,
+
+	/* Text/Chapter tracks */
+	CODEC_TEXT,
+};
+
+struct chunk { /* Element type of mp4_track.chunks */
+	uint64_t offset;  /* presumably the chunk's position in the file - confirm */
+	uint32_t size;
+	uint32_t samples; /* presumably the number of samples in the chunk - confirm */
+};
+
+struct sample_delta { /* Element type of mp4_track.deltas */
+	uint32_t count; /* presumably a run length of samples sharing `delta` - confirm */
+	uint32_t delta;
+};
+
+struct sample_offset { /* Element type of mp4_track.offsets */
+	uint32_t count; /* presumably a run length of samples sharing `offset` - confirm */
+	int32_t offset; /* signed, unlike sample_delta.delta */
+};
+
+struct fragment_sample { /* Element type of mp4_track.fragment_samples */
+	uint32_t size;
+	int32_t offset;
+	uint32_t duration;
+};
+
+struct mp4_track { /* State kept by the muxer for a single track */
+	enum mp4_track_type type;
+	enum mp4_codec codec;
+
+	/* Track ID in container (written to the tkhd box) */
+	uint8_t track_id;
+	/* Number of samples for this track */
+	uint64_t samples;
+	/* Duration for this track; the header writers rescale it by
+	 * 1000 / timebase_den (mvhd, tkhd) and, for video tracks, by
+	 * timescale / timebase_den (mdhd) */
+	uint64_t duration;
+
+	/* Encoder associated with this track */
+	obs_encoder_t *encoder;
+
+	/* Time Base (1/FPS for video, 1/sample rate for audio) */
+	uint32_t timebase_num;
+	uint32_t timebase_den;
+	/* Output timescale calculated from time base (Video only) */
+	uint32_t timescale;
+
+	/* First PTS this track has seen (in track timescale) */
+	int64_t first_pts;
+	/* Highest PTS this track has seen (in usec) */
+	int64_t last_pts_usec;
+
+	/* deque of encoder_packet belonging to this track */
+	struct deque packets;
+
+	/* Sample sizes (fixed for PCM) */
+	uint32_t sample_size;
+	DARRAY(uint32_t) sample_sizes;
+	/* Data chunks in file containing samples for this track */
+	DARRAY(struct chunk) chunks;
+	/* Time delta between samples */
+	DARRAY(struct sample_delta) deltas;
+
+	/* Sample CT-DT offset, i.e. DTS-PTS offset (Video only) */
+	bool needs_ctts;
+	int32_t dts_offset;
+	DARRAY(struct sample_offset) offsets;
+	/* Sync samples, i.e. keyframes (Video only) */
+	DARRAY(uint32_t) sync_samples;
+
+	/* Temporary array with information about the samples to be included
+	 * in the next fragment. */
+	DARRAY(struct fragment_sample) fragment_samples;
+};
+
+struct mp4_mux { /* Muxer instance state shared by all box writers */
+	obs_output_t *output; /* also supplies the name used in log messages */
+	struct serializer *serializer; /* destination of every box write */
+
+	/* Target format compatibility */
+	enum mp4_flavour mode;
+
+	/* Flags */
+	enum mp4_mux_flags flags;
+
+	uint32_t fragments_written;
+	/* PTS where next fragmentation should take place */
+	int64_t next_frag_pts;
+
+	/* Creation time (seconds since Jan 1 1904) */
+	uint64_t creation_time;
+
+	/* Offset of placeholder atom/box to contain final mdat header */
+	size_t placeholder_offset;
+
+	uint8_t track_ctr; /* mvhd writes track_ctr + 1 as next_track_ID */
+	/* Audio/Video tracks */
+	DARRAY(struct mp4_track) tracks;
+	/* Special tracks */
+	struct mp4_track *chapter_track;
+};
+
+/* clang-format off */
+// Defined in ISO/IEC 14496-12:2015 Section 8.2.2.1
+// Written verbatim as the matrix field of the mvhd and tkhd boxes. Declared
+// static so that every translation unit including this header gets its own
+// copy instead of a second external definition of the same symbol.
+static const int32_t UNITY_MATRIX[9] = {
+	0x00010000,	0,		0,
+	0,		0x00010000,	0,
+	0,		0,		0x40000000
+};
+/* clang-format on */
+
+enum tfhd_flags { /* Flag bits for a 'tfhd' box - presumably the tf_flags of ISO/IEC 14496-12 8.8.7; confirm */
+	BASE_DATA_OFFSET_PRESENT = 0x000001,
+	SAMPLE_DESCRIPTION_INDEX_PRESENT = 0x000002,
+	DEFAULT_SAMPLE_DURATION_PRESENT = 0x000008,
+	DEFAULT_SAMPLE_SIZE_PRESENT = 0x000010,
+	DEFAULT_SAMPLE_FLAGS_PRESENT = 0x000020,
+	DURATION_IS_EMPTY = 0x010000,
+	DEFAULT_BASE_IS_MOOF = 0x020000,
+};
+
+enum trun_flags { /* Flag bits for a 'trun' box - presumably the tr_flags of ISO/IEC 14496-12 8.8.8; confirm */
+	DATA_OFFSET_PRESENT = 0x000001,
+	FIRST_SAMPLE_FLAGS_PRESENT = 0x000004,
+	SAMPLE_DURATION_PRESENT = 0x000100,
+	SAMPLE_SIZE_PRESENT = 0x000200,
+	SAMPLE_FLAGS_PRESENT = 0x000400,
+	SAMPLE_COMPOSITION_TIME_OFFSETS_PRESENT = 0x000800,
+};
+
+/*
+ * ISO Standard structure (big endian so we can't easily use it):
+ *
+ * struct sample_flags {
+ * 	uint32_t reserved : 4;
+ * 	uint32_t is_leading : 2;
+ *	uint32_t sample_depends_on : 2;
+ *	uint32_t sample_is_depended_on : 2;
+ *	uint32_t sample_has_redundancy : 2;
+ *	uint32_t sample_padding_value : 3;
+ *	uint32_t sample_is_non_sync_sample : 1;
+ *	uint32_t sample_degradation_priority : 16;
+};
+*/
+
+enum sample_flags { /* Pre-shifted values for the bit layout described above */
+	SAMPLE_FLAG_IS_NON_SYNC = 0x00010000, /* sample_is_non_sync_sample = 1 */
+	SAMPLE_FLAG_DEPENDS_YES = 0x01000000, /* sample_depends_on = 1 */
+	SAMPLE_FLAG_DEPENDS_NO = 0x02000000,  /* sample_depends_on = 2 */
+};
+
+#ifndef _WIN32
+/* Returns the smaller of the two sizes. */
+static inline size_t min(size_t a, size_t b)
+{
+	if (a < b)
+		return a;
+
+	return b;
+}
+#endif
+
+static inline void get_speaker_positions(enum speaker_layout layout,
+					 uint8_t *arr, uint8_t *size,
+					 uint8_t *iso_layout)
+{ /* Fills arr with one speaker position code per channel and stores the
+   * channel count in *size. *iso_layout is only written for some layouts
+   * and there is no default case: a layout without a case leaves all three
+   * outputs untouched. NOTE(review): callers presumably pre-initialise
+   * *iso_layout and provide room for at least 8 entries in arr - confirm. */
+	switch (layout) {
+	case SPEAKERS_MONO:
+		arr[0] = 2; // FC
+		*size = 1;
+		*iso_layout = 1;
+		break;
+	case SPEAKERS_UNKNOWN: // handled exactly like stereo
+	case SPEAKERS_STEREO:
+		arr[0] = 0; // FL
+		arr[1] = 1; // FR
+		*size = 2;
+		*iso_layout = 2;
+		break;
+	case SPEAKERS_2POINT1:
+		arr[0] = 0; // FL
+		arr[1] = 1; // FR
+		arr[2] = 3; // LFE
+		*size = 3; // *iso_layout is not written for this layout
+		break;
+	case SPEAKERS_4POINT0:
+		arr[0] = 0;  // FL
+		arr[1] = 1;  // FR
+		arr[2] = 2;  // FC
+		arr[3] = 10; // RC
+		*size = 4;
+		*iso_layout = 4;
+		break;
+	case SPEAKERS_4POINT1:
+		arr[0] = 0;  // FL
+		arr[1] = 1;  // FR
+		arr[2] = 2;  // FC
+		arr[3] = 3;  // LFE
+		arr[4] = 10; // RC
+		*size = 5; // *iso_layout is not written for this layout
+		break;
+	case SPEAKERS_5POINT1:
+		arr[0] = 0; // FL
+		arr[1] = 1; // FR
+		arr[2] = 2; // FC
+		arr[3] = 3; // LFE
+		arr[4] = 8; // RL
+		arr[5] = 9; // RR
+		*size = 6; // *iso_layout is not written for this layout
+		break;
+	case SPEAKERS_7POINT1:
+		arr[0] = 0;  // FL
+		arr[1] = 1;  // FR
+		arr[2] = 2;  // FC
+		arr[3] = 3;  // LFE
+		arr[4] = 8;  // RL
+		arr[5] = 9;  // RR
+		arr[6] = 13; // SL
+		arr[7] = 14; // SR
+		*size = 8;
+		*iso_layout = 12;
+		break;
+	}
+}
+
+/* Derives colour description values from the encoder's video output: the
+ * colour primaries (pri), transfer characteristics (trc), matrix
+ * coefficients (spc) and whether the range is full (1) or not (0).
+ * A colourspace without a case leaves pri/trc/spc untouched. */
+static inline void get_colour_information(obs_encoder_t *enc, uint16_t *pri,
+					  uint16_t *trc, uint16_t *spc,
+					  uint8_t *full_range)
+{
+	const struct video_output_info *voi =
+		video_output_get_info(obs_encoder_video(enc));
+
+	if (voi->range == VIDEO_RANGE_FULL)
+		*full_range = 1;
+	else
+		*full_range = 0;
+
+	switch (voi->colorspace) {
+	case VIDEO_CS_DEFAULT:
+	case VIDEO_CS_709:
+		*pri = *trc = *spc = 1; // OBSCOL_PRI_BT709
+		break;
+	case VIDEO_CS_601:
+		*pri = *trc = *spc = 6; // OBSCOL_PRI_SMPTE170M
+		break;
+	case VIDEO_CS_SRGB:
+		*pri = *spc = 1; // OBSCOL_PRI_BT709
+		*trc = 13;       // OBSCOL_TRC_IEC61966_2_1
+		break;
+	case VIDEO_CS_2100_PQ:
+		*pri = *spc = 9; // OBSCOL_PRI_BT2020 / OBSCOL_SPC_BT2020_NCL
+		*trc = 16;       // OBSCOL_TRC_SMPTE2084
+		break;
+	case VIDEO_CS_2100_HLG:
+		*pri = *spc = 9; // OBSCOL_PRI_BT2020 / OBSCOL_SPC_BT2020_NCL
+		*trc = 18;       // OBSCOL_TRC_ARIB_STD_B67
+		break;
+	}
+}
+
+/* Chapter stubs (from libavformat/movenc.c) */
+
+static const uint8_t TEXT_STUB_HEADER[] = {
+	// TextSampleEntry
+	0x00, 0x00, 0x00, 0x01, // displayFlags
+	0x00, 0x00,             // horizontal + vertical justification
+	0x00, 0x00, 0x00, 0x00, // bgColourRed/Green/Blue/Alpha
+	// BoxRecord
+	0x00, 0x00, 0x00, 0x00, // defTextBoxTop/Left
+	0x00, 0x00, 0x00, 0x00, // defTextBoxBottom/Right
+	// StyleRecord
+	0x00, 0x00, 0x00, 0x00, // startChar + endChar
+	0x00, 0x01,             // fontID
+	0x00, 0x00,             // fontStyleFlags + fontSize
+	0x00, 0x00, 0x00, 0x00, // fgColourRed/Green/Blue/Alpha
+	// FontTableBox
+	0x00, 0x00, 0x00, 0x0D, // box size (13 = 4 size + 4 name + 2 count + 2 ID + 1 length)
+	'f', 't', 'a', 'b',     // box atom name
+	0x00, 0x01,             // entry count
+	// FontRecord
+	0x00, 0x01, // font ID
+	0x00,       // font name length (0, so no name bytes follow)
+};
+
+/* clang-format off */
+static const char CHAPTER_PKT_FOOTER[12] = {
+	0x00, 0x00, 0x00, 0x0C, /* box size: 12 bytes, i.e. this whole array */
+	'e',  'n',  'c',  'd',  /* box name */
+	0x00, 0x00, 0x01, 0x00  /* payload; presumably a text encoding marker as in movenc.c - confirm */
+};
+/* clang-format on */

+ 2843 - 0
plugins/obs-outputs/mp4-mux.c

@@ -0,0 +1,2843 @@
+/******************************************************************************
+    Copyright (C) 2024 by Dennis Sädtler <[email protected]>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+******************************************************************************/
+
+#include "mp4-mux-internal.h"
+
+#include "rtmp-hevc.h"
+#include "rtmp-av1.h"
+
+#include <obs-avc.h>
+#include <obs-hevc.h>
+#include <obs-module.h>
+#include <util/dstr.h>
+#include <util/platform.h>
+#include <util/array-serializer.h>
+
+#include <time.h>
+
+/*
+ * (Mostly) compliant MP4 muxer for fun and profit.
+ * Based on ISO/IEC 14496-12 and FFmpeg's libavformat/movenc.c ([L]GPL)
+ *
+ * Specification section numbers are noted where applicable.
+ * Standard identifier is included if not referring to ISO/IEC 14496-12.
+ */
+
+#define do_log(level, format, ...) /* expects a `mux` variable in scope */ \
+	blog(level, "[mp4 muxer: '%s'] " format, \
+	     obs_output_get_name(mux->output), ##__VA_ARGS__)
+
+#define warn(format, ...) do_log(LOG_WARNING, format, ##__VA_ARGS__) /* LOG_WARNING shorthand */
+#define info(format, ...) do_log(LOG_INFO, format, ##__VA_ARGS__) /* LOG_INFO shorthand */
+
+/* Patches the 32-bit size placeholder located at `start` with the number of
+ * bytes between `start` and the current position, moves the serializer back
+ * to where it was and returns that byte count. */
+static inline size_t write_box_size(struct serializer *s, int64_t start)
+{
+	const int64_t resume_pos = serializer_get_pos(s);
+	const size_t box_size = resume_pos - start;
+
+	serializer_seek(s, start, SERIALIZE_SEEK_START);
+	s_wb32(s, (uint32_t)box_size);
+	serializer_seek(s, resume_pos, SERIALIZE_SEEK_START);
+
+	return box_size;
+}
+
+/// 4.2 Box header with size and char[4] name
+///
+/// Sizes that do not fit into 32 bits are written as size 1 followed by the
+/// box type and a 64-bit largesize.
+static inline void write_box(struct serializer *s, const size_t size,
+			     const char name[4])
+{
+	const bool needs_largesize = size > UINT32_MAX;
+
+	s_wb32(s, needs_largesize ? 1 : (uint32_t)size); // size
+	s_write(s, name, 4);                             // boxtype
+
+	if (needs_largesize)
+		s_wb64(s, size); // largesize
+}
+
+/// 4.2 FullBox extended header with u8 version and u24 flags
+static inline void write_fullbox(struct serializer *s, const size_t size,
+				 const char name[4], uint8_t version,
+				 uint32_t flags)
+{
+	write_box(s, size, name); // regular box header (size + boxtype) first
+	s_w8(s, version);         // followed by the version byte
+	s_wb24(s, flags);         // and the flags via the 24-bit writer
+}
+
+/// 4.3 File Type Box
+///
+/// Writes 'ftyp' with the major brand, minor version 512 and the list of
+/// compatible brands, then patches in the box size and returns it.
+static size_t mp4_write_ftyp(struct mp4_mux *mux, bool fragmented)
+{
+	struct serializer *s = mux->serializer;
+	const int64_t box_start = serializer_get_pos(s);
+	const bool negative_cts = (mux->flags & MP4_USE_NEGATIVE_CTS) != 0;
+
+	/* Following FFmpeg's example, negative CTS needs iso4 (regular) or
+	 * iso6 (currently fragmented) as major brand; isom is used otherwise. */
+	const char *major_brand = "isom";
+	if (negative_cts)
+		major_brand = fragmented ? "iso6" : "iso4";
+
+	write_box(s, 0, "ftyp");
+
+	s_write(s, major_brand, 4); // major brand
+	s_wb32(s, 512);             // minor version
+
+	/* Compatible brands, the first one repeats the major brand */
+	s_write(s, major_brand, 4);
+
+	/* The major brand only differs from isom with negative CTS, in which
+	 * case isom is added as the base brand. */
+	if (negative_cts)
+		s_write(s, "isom", 4);
+
+	/* The newer iso6 brand is only added for fragmented files that do not
+	 * already carry it as major brand; "obs1" acts as a placeholder
+	 * otherwise to maintain the ftyp box size. */
+	if (fragmented && !negative_cts)
+		s_write(s, "iso6", 4);
+	else
+		s_write(s, "obs1", 4);
+
+	s_write(s, "iso2", 4);
+
+	/* Only the first video track decides whether the H.264 brand is
+	 * included. */
+	for (size_t i = 0; i < mux->tracks.num; i++) {
+		const struct mp4_track *trk = &mux->tracks.array[i];
+		if (trk->type != TRACK_VIDEO)
+			continue;
+
+		if (trk->codec == CODEC_H264)
+			s_write(s, "avc1", 4);
+		break;
+	}
+
+	/* General MP4 brand */
+	s_write(s, "mp41", 4);
+
+	return write_box_size(s, box_start);
+}
+
+/// 8.1.2 Free Space Box
+///
+/// The box is exactly 16 bytes long so that it can be replaced with a
+/// 64-bit size box header (u32 + char[4] + u64). Returns the box size.
+static size_t mp4_write_free(struct mp4_mux *mux)
+{
+	const size_t box_size = 16;
+	struct serializer *s = mux->serializer;
+
+	s_wb32(s, (uint32_t)box_size); // size
+	s_write(s, "free", 4);         // boxtype
+	s_wb64(s, 0);                  // 8 bytes of padding
+
+	return box_size;
+}
+
+/// 8.2.2 Movie Header Box
+///
+/// Writes 'mvhd' with a fixed movie timescale of 1000. The duration is
+/// derived from the first video track (0 if there is none). A version 1
+/// box with 64-bit times is written when the duration or creation time
+/// does not fit into 32 bits. Returns the size of the box.
+static size_t mp4_write_mvhd(struct mp4_mux *mux)
+{
+	struct serializer *s = mux->serializer;
+	size_t start = serializer_get_pos(s);
+
+	/* Use primary video track as the baseline for duration */
+	uint64_t duration = 0;
+	for (size_t i = 0; i < mux->tracks.num; i++) {
+		struct mp4_track *track = &mux->tracks.array[i];
+		if (track->type == TRACK_VIDEO) {
+			duration = util_mul_div64(track->duration, 1000,
+						  track->timebase_den);
+			break;
+		}
+	}
+
+	/* The 64-bit field layout is only valid in a version 1 box, so the
+	 * version must follow the same condition as the field widths. */
+	const bool use_64bit = duration > UINT32_MAX ||
+			       mux->creation_time > UINT32_MAX;
+
+	write_fullbox(s, 0, "mvhd", use_64bit ? 1 : 0, 0);
+
+	if (use_64bit) {
+		s_wb64(s, mux->creation_time); // creation time
+		s_wb64(s, mux->creation_time); // modification time
+		s_wb32(s, 1000);               // timescale
+		s_wb64(s, duration);           // duration (0 for fragmented)
+	} else {
+		s_wb32(s, (uint32_t)mux->creation_time); // creation time
+		s_wb32(s, (uint32_t)mux->creation_time); // modification time
+		s_wb32(s, 1000);                         // timescale
+		s_wb32(s, (uint32_t)duration); // duration (0 for fragmented)
+	}
+
+	s_wb32(s, 0x00010000); // rate, 16.16 fixed float (1 << 16)
+	s_wb16(s, 0x0100);     // volume
+
+	s_wb16(s, 0); // reserved
+	s_wb32(s, 0); // reserved
+	s_wb32(s, 0); // reserved
+
+	// Matrix
+	for (int i = 0; i < 9; i++)
+		s_wb32(s, UNITY_MATRIX[i]);
+
+	// pre_defined
+	s_wb32(s, 0);
+	s_wb32(s, 0);
+	s_wb32(s, 0);
+	s_wb32(s, 0);
+	s_wb32(s, 0);
+	s_wb32(s, 0);
+
+	s_wb32(s, mux->track_ctr + 1); // next_track_ID
+
+	return write_box_size(s, start);
+}
+
+/// 8.3.2 Track Header Box
+///
+/// Writes 'tkhd' for the given track with its duration converted to the
+/// movie timescale (1000). A version 1 box with 64-bit times is written
+/// when the duration or creation time does not fit into 32 bits.
+/// Returns the size of the box.
+static size_t mp4_write_tkhd(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+	size_t start = serializer_get_pos(s);
+
+	uint64_t duration =
+		util_mul_div64(track->duration, 1000, track->timebase_den);
+
+	/* The 64-bit field layout is only valid in a version 1 box, so the
+	 * version must follow the same condition as the field widths. */
+	const bool use_64bit = duration > UINT32_MAX ||
+			       mux->creation_time > UINT32_MAX;
+
+	/* Flags are 0x1 (enabled) | 0x2 (in movie) */
+	static const uint32_t flags = 0x1 | 0x2;
+	write_fullbox(s, 0, "tkhd", use_64bit ? 1 : 0, flags);
+
+	if (use_64bit) {
+		s_wb64(s, mux->creation_time); // creation time
+		s_wb64(s, mux->creation_time); // modification time
+		s_wb32(s, track->track_id);    // track_id
+		s_wb32(s, 0);                  // reserved
+		s_wb64(s, duration);           // duration in movie timescale
+	} else {
+		s_wb32(s, (uint32_t)mux->creation_time); // creation time
+		s_wb32(s, (uint32_t)mux->creation_time); // modification time
+		s_wb32(s, track->track_id);              // track_id
+		s_wb32(s, 0);                            // reserved
+		s_wb32(s, (uint32_t)duration); // duration in movie timescale
+	}
+
+	s_wb32(s, 0);                                      // reserved
+	s_wb32(s, 0);                                      // reserved
+	s_wb16(s, 0);                                      // layer
+	s_wb16(s, track->type == TRACK_AUDIO ? 1 : 0);     // alternate group
+	s_wb16(s, track->type == TRACK_AUDIO ? 0x100 : 0); // volume
+	s_wb16(s, 0);                                      // reserved
+
+	// Matrix (predefined)
+	for (int i = 0; i < 9; i++)
+		s_wb32(s, UNITY_MATRIX[i]);
+
+	if (track->type == TRACK_AUDIO) {
+		s_wb32(s, 0); // width
+		s_wb32(s, 0); // height
+	} else {
+		/* width/height are fixed point 16.16, so we just shift the
+		 * integer to the upper 16 bits */
+		uint32_t width = obs_encoder_get_width(track->encoder);
+		s_wb32(s, width << 16);
+		uint32_t height = obs_encoder_get_height(track->encoder);
+		s_wb32(s, height << 16);
+	}
+
+	return write_box_size(s, start);
+}
+
+/// 8.4.2 Media Header Box
+///
+/// Writes 'mdhd' for the given track. Video track durations are converted
+/// to the track timescale first. Returns the box size: 32 bytes for a
+/// version 0 box, 44 bytes for a version 1 box with 64-bit times.
+static size_t mp4_write_mdhd(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+
+	size_t size = 32;
+	uint8_t version = 0;
+	uint64_t duration = track->duration;
+	uint32_t timescale = track->timescale;
+
+	if (track->type == TRACK_VIDEO) {
+		/* Update to track timescale */
+		duration = util_mul_div64(duration, track->timescale,
+					  track->timebase_den);
+	}
+
+	/* use 64-bit duration if necessary */
+	if (duration > UINT32_MAX || mux->creation_time > UINT32_MAX) {
+		size = 44;
+		version = 1;
+	}
+
+	write_fullbox(s, size, "mdhd", version, 0);
+
+	if (version == 1) {
+		s_wb64(s, mux->creation_time); // creation time
+		s_wb64(s, mux->creation_time); // modification time
+		s_wb32(s, timescale);          // timescale
+		/* Written in full: truncating to 32 bits here would corrupt
+		 * exactly the durations that made version 1 necessary. */
+		s_wb64(s, duration); // duration
+	} else {
+		s_wb32(s, (uint32_t)mux->creation_time); // creation time
+		s_wb32(s, (uint32_t)mux->creation_time); // modification time
+		s_wb32(s, timescale);                    // timescale
+		s_wb32(s, (uint32_t)duration);           // duration
+	}
+
+	s_wb16(s, 21956); // language (undefined)
+	s_wb16(s, 0);     // pre_defined
+
+	return size;
+}
+
+/// 8.4.3 Handler Reference Box
+///
+/// Writes 'hdlr' with a handler type and name chosen by track type: video
+/// and chapter tracks have their own, every other type uses the audio
+/// handler. Returns the size of the box.
+static size_t mp4_write_hdlr(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+	const int64_t box_start = serializer_get_pos(s);
+
+	const char *handler_type;
+	const char *handler_name;
+
+	switch (track->type) {
+	case TRACK_VIDEO:
+		handler_type = "vide";
+		handler_name = "OBS Video Handler";
+		break;
+	case TRACK_CHAPTERS:
+		handler_type = "text";
+		handler_name = "OBS Chapter Handler";
+		break;
+	default:
+		handler_type = "soun";
+		handler_name = "OBS Audio Handler";
+		break;
+	}
+
+	write_fullbox(s, 0, "hdlr", 0, 0);
+
+	s_wb32(s, 0);                // pre_defined
+	s_write(s, handler_type, 4); // handler_type
+
+	// three reserved 32-bit fields
+	for (int i = 0; i < 3; i++)
+		s_wb32(s, 0);
+
+	// name (utf-8 string, written including its null terminator)
+	s_write(s, handler_name, strlen(handler_name) + 1);
+
+	return write_box_size(s, box_start);
+}
+
+/// 12.1.2 Video media header
+///
+/// Writes the fixed-size 'vmhd' box and returns its size: 8 bytes box
+/// header + 4 bytes version/flags + 8 bytes payload = 20 bytes.
+static size_t mp4_write_vmhd(struct mp4_mux *mux)
+{
+	struct serializer *s = mux->serializer;
+	const size_t box_size = 20;
+
+	/* Flags is always 1 */
+	write_fullbox(s, box_size, "vmhd", 0, 1);
+
+	s_wb16(s, 0); // graphicsmode
+	s_wb16(s, 0); // opcolor r
+	s_wb16(s, 0); // opcolor g
+	s_wb16(s, 0); // opcolor b
+
+	/* Report the size that was written into the header above */
+	return box_size;
+}
+
+/// 12.2.2 Sound media header
+///
+/// Writes the fixed-size (16 bytes) 'smhd' box and returns its size.
+static size_t mp4_write_smhd(struct mp4_mux *mux)
+{
+	const size_t box_size = 16;
+	struct serializer *out = mux->serializer;
+
+	write_fullbox(out, box_size, "smhd", 0, 0);
+
+	s_wb16(out, 0); // balance
+	s_wb16(out, 0); // reserved
+
+	return box_size;
+}
+
+/// (QTFF/Apple) Text media information atom
+///
+/// Writes the 'text' atom with a fixed payload and returns its size.
+static size_t mp4_write_qt_text(struct mp4_mux *mux)
+{
+	/* 32-bit words following the leading 16-bit value of the matrix */
+	static const uint32_t matrix_words[] = {
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00004000,
+	};
+	const size_t num_words = sizeof(matrix_words) / sizeof(matrix_words[0]);
+
+	struct serializer *s = mux->serializer;
+	const int64_t box_start = serializer_get_pos(s);
+
+	write_box(s, 0, "text");
+
+	/* Identity matrix, note that it's not fixed point 16.16 */
+	s_wb16(s, 0x01);
+	for (size_t i = 0; i < num_words; i++)
+		s_wb32(s, matrix_words[i]);
+
+	/* Seemingly undocumented */
+	s_wb16(s, 0x0000);
+
+	return write_box_size(s, box_start);
+}
+
+/// (QTFF/Apple) Base media info atom
+///
+/// Writes the 'gmin' atom with fixed values and returns its size.
+static size_t mp4_write_gmin(struct mp4_mux *mux)
+{
+	/* In order: graphics mode, opColor r/g/b, balance, reserved */
+	static const uint16_t fields[] = {
+		0x40, 0x8000, 0x8000, 0x8000, 0, 0,
+	};
+	const size_t num_fields = sizeof(fields) / sizeof(fields[0]);
+
+	struct serializer *s = mux->serializer;
+	const int64_t box_start = serializer_get_pos(s);
+
+	write_fullbox(s, 0, "gmin", 0, 0);
+
+	for (size_t i = 0; i < num_fields; i++)
+		s_wb16(s, fields[i]);
+
+	return write_box_size(s, box_start);
+}
+
+/// (QTFF/Apple) Base media information header atom
+///
+/// Container atom holding 'gmin' followed by the QuickTime 'text' atom.
+/// Returns the total size of the atom including both children.
+static size_t mp4_write_gmhd(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "gmhd");
+
+	mp4_write_gmin(mux);    // gmin
+	mp4_write_qt_text(mux); // text (QuickTime)
+
+	return write_box_size(out, box_start);
+}
+
+/// ISO/IEC 14496-15 5.4.2.1 AVCConfigurationBox
+///
+/// Writes 'avcC' containing the encoder's extra data after it has been
+/// run through obs_parse_avc_header(). Returns the box size, or 0 (with
+/// nothing written) if the encoder provides no extra data.
+static size_t mp4_write_avcC(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	uint8_t *extra_data;
+	size_t extra_size;
+
+	if (!obs_encoder_get_extra_data(enc, &extra_data, &extra_size))
+		return 0;
+
+	/* The parsed header is a new allocation that is freed below */
+	uint8_t *config = NULL;
+	const size_t config_size =
+		obs_parse_avc_header(&config, extra_data, extra_size);
+
+	/* 8 bytes box header + payload */
+	const size_t box_size = config_size + 8;
+
+	struct serializer *s = mux->serializer;
+	write_box(s, box_size, "avcC");
+	s_write(s, config, config_size);
+
+	bfree(config);
+	return box_size;
+}
+
+/// ISO/IEC 14496-15 8.4.1.1 HEVCConfigurationBox
+///
+/// Writes 'hvcC' containing the encoder's extra data after it has been
+/// run through obs_parse_hevc_header(). Returns the box size, or 0 (with
+/// nothing written) if the encoder provides no extra data.
+static size_t mp4_write_hvcC(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	uint8_t *extra_data;
+	size_t extra_size;
+
+	if (!obs_encoder_get_extra_data(enc, &extra_data, &extra_size))
+		return 0;
+
+	/* The parsed header is a new allocation that is freed below */
+	uint8_t *config = NULL;
+	const size_t config_size =
+		obs_parse_hevc_header(&config, extra_data, extra_size);
+
+	/* 8 bytes box header + payload */
+	const size_t box_size = config_size + 8;
+
+	struct serializer *s = mux->serializer;
+	write_box(s, box_size, "hvcC");
+	s_write(s, config, config_size);
+
+	bfree(config);
+	return box_size;
+}
+
+/// AV1 ISOBMFF 2.3. AV1 Codec Configuration Box
+///
+/// Writes 'av1C' containing the encoder's extra data after it has been
+/// run through obs_parse_av1_header(). Returns the box size, or 0 (with
+/// nothing written) if the encoder provides no extra data.
+static size_t mp4_write_av1C(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	uint8_t *extra_data;
+	size_t extra_size;
+
+	if (!obs_encoder_get_extra_data(enc, &extra_data, &extra_size))
+		return 0;
+
+	/* The parsed header is a new allocation that is freed below */
+	uint8_t *config = NULL;
+	const size_t config_size =
+		obs_parse_av1_header(&config, extra_data, extra_size);
+
+	/* 8 bytes box header + payload */
+	const size_t box_size = config_size + 8;
+
+	struct serializer *s = mux->serializer;
+	write_box(s, box_size, "av1C");
+	s_write(s, config, config_size);
+
+	bfree(config);
+	return box_size;
+}
+
+/// 12.1.5 Colour information
+/// Writes a fixed-size (19 byte) "colr" box of colour type "nclx" using the
+/// values reported by get_colour_information() for the given encoder.
+/// Always returns 19.
+static size_t mp4_write_colr(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	/* NOTE: enc is used below, so it must not be marked as unused. */
+	struct serializer *s = mux->serializer;
+
+	write_box(s, 19, "colr");
+
+	/* Start from zero so that any value get_colour_information() does
+	 * not fill in is still written as a defined number. */
+	uint8_t full_range = 0;
+	uint16_t pri = 0;
+	uint16_t trc = 0;
+	uint16_t spc = 0;
+	get_colour_information(enc, &pri, &trc, &spc, &full_range);
+
+	s_write(s, "nclx", 4);    // colour_type
+	s_wb16(s, pri);           // colour_primaries
+	s_wb16(s, trc);           // transfer_characteristics
+	s_wb16(s, spc);           // matrix_coefficients
+	s_w8(s, full_range << 7); // full range flag + 7 reserved bits (0)
+
+	return 19;
+}
+
+/// 12.1.4 Pixel Aspect Ratio
+/// Writes a fixed 16-byte "pasp" box with a 1:1 ratio and returns its size.
+static size_t mp4_write_pasp(struct mp4_mux *mux)
+{
+	const size_t box_size = 16;
+	struct serializer *out = mux->serializer;
+
+	write_box(out, box_size, "pasp");
+
+	/* Both spacings are 1. */
+	s_wb32(out, 1); // hSpacing
+	s_wb32(out, 1); // vSpacing
+
+	return box_size;
+}
+
+/// 12.1.3 Visual Sample Entry
+/// Writes the fields shared by the video sample entries; the caller is
+/// responsible for the surrounding box header and any child boxes.
+static inline void mp4_write_visual_sample_entry(struct mp4_mux *mux,
+						 obs_encoder_t *enc)
+{
+	struct serializer *out = mux->serializer;
+
+	/* ---- SampleEntry fields ---- */
+	for (int i = 0; i < 6; i++)
+		s_w8(out, 0); // reserved (6 bytes)
+
+	s_wb16(out, 1); // data_reference_index
+
+	/* ---- VisualSampleEntry fields ---- */
+	s_wb16(out, 0); // pre_defined
+	s_wb16(out, 0); // reserved
+	for (int i = 0; i < 3; i++)
+		s_wb32(out, 0); // pre_defined (3 x 32 bit)
+
+	/* Dimensions are taken from the encoder and narrowed to 16 bit. */
+	s_wb16(out, (uint16_t)obs_encoder_get_width(enc));  // width
+	s_wb16(out, (uint16_t)obs_encoder_get_height(enc)); // height
+
+	s_wb32(out, 0x00480000); // horizresolution (predefined)
+	s_wb32(out, 0x00480000); // vertresolution (predefined)
+
+	s_wb32(out, 0); // reserved
+	s_wb16(out, 1); // frame_count
+
+	/* compressorname is a fixed 32-byte field: one length byte followed
+	 * by up to 31 characters without a terminator, zero padded. The
+	 * encoder id is used as the name and truncated if necessary. */
+	char name_field[32] = {0};
+	const char *enc_id = obs_encoder_get_id(enc);
+	if (enc_id) {
+		const size_t full_len = strlen(enc_id);
+		const size_t used_len = full_len > 31 ? 31 : full_len;
+
+		name_field[0] = (char)used_len;
+		memcpy(&name_field[1], enc_id, used_len);
+	}
+	s_write(out, name_field, sizeof(name_field)); // compressorname
+
+	s_wb16(out, 0x0018); // depth
+	s_wb16(out, -1);     // pre_defined
+}
+
+/// 12.1.6 Content light level
+/// Writes a 12-byte "clli" box for HDR video and returns 12; writes nothing
+/// and returns 0 for any other colour space.
+static size_t mp4_write_clli(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	struct serializer *out = mux->serializer;
+
+	const struct video_output_info *voi =
+		video_output_get_info(obs_encoder_video(enc));
+
+	/* The box is only meaningful for the two BT.2100 colour spaces. */
+	const bool is_hdr = voi->colorspace == VIDEO_CS_2100_PQ ||
+			    voi->colorspace == VIDEO_CS_2100_HLG;
+	if (!is_hdr)
+		return 0;
+
+	write_box(out, 12, "clli");
+
+	/* Both fields carry the nominal peak level truncated to 16 bit. */
+	const float peak = obs_get_video_hdr_nominal_peak_level();
+
+	s_wb16(out, (uint16_t)peak); // max_content_light_level
+	s_wb16(out, (uint16_t)peak); // max_pic_average_light_level
+
+	return 12;
+}
+
+/// 12.1.7 Mastering display colour volume
+/// Writes a 32-byte "mdcv" box for HDR video and returns 32; writes nothing
+/// and returns 0 for any other colour space.
+static size_t mp4_write_mdcv(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	struct serializer *s = mux->serializer;
+
+	video_t *video = obs_encoder_video(enc);
+	const struct video_output_info *info = video_output_get_info(video);
+
+	// Only write atom for HDR video
+	if (info->colorspace != VIDEO_CS_2100_PQ &&
+	    info->colorspace != VIDEO_CS_2100_HLG)
+		return 0;
+
+	write_box(s, 32, "mdcv");
+
+	/* Scale first, then truncate. A cast binds tighter than the
+	 * multiplication, so "(uint32_t)nominal_peak * 10000" would drop the
+	 * fractional part of the peak level before scaling it. The product
+	 * is computed in double so it is not rounded to float precision. */
+	float nominal_peak = obs_get_video_hdr_nominal_peak_level();
+	uint32_t max_lum = (uint32_t)((double)nominal_peak * 10000.0);
+
+	/* Note that these values are hardcoded everywhere in OBS, so these are
+	 * just the same as used in our other muxers/encoders. */
+
+	// 3 x display_primaries (x, y) pairs
+	s_wb16(s, 13250);
+	s_wb16(s, 34500);
+	s_wb16(s, 7500);
+	s_wb16(s, 3000);
+	s_wb16(s, 34000);
+	s_wb16(s, 16000);
+
+	s_wb16(s, 15635);   // white_point_x
+	s_wb16(s, 16450);   // white_point_y
+	s_wb32(s, max_lum); // max_display_mastering_luminance
+	s_wb32(s, 0);       // min_display_mastering_luminance
+
+	return 32;
+}
+
+/// ISO/IEC 14496-15 5.4.2.1 AVCSampleEntry
+/// Writes an "avc1" sample entry: the common visual sample entry fields
+/// followed by the avcC, colr and pasp child boxes.
+static size_t mp4_write_avc1(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "avc1");
+	mp4_write_visual_sample_entry(mux, enc);
+
+	/* Child boxes, in file order */
+	mp4_write_avcC(mux, enc); // codec configuration
+	mp4_write_colr(mux, enc); // colour information
+	mp4_write_pasp(mux);      // pixel aspect ratio
+
+	return write_box_size(out, box_start);
+}
+
+/// ISO/IEC 14496-15 8.4.1.1 HEVCSampleEntry
+/// Writes an "hvc1" sample entry: the common visual sample entry fields
+/// followed by the hvcC, colr, clli, mdcv and pasp child boxes.
+static size_t mp4_write_hvc1(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "hvc1");
+	mp4_write_visual_sample_entry(mux, enc);
+
+	/* Child boxes, in file order */
+	mp4_write_hvcC(mux, enc); // codec configuration
+	mp4_write_colr(mux, enc); // colour information
+	mp4_write_clli(mux, enc); // content light level
+	mp4_write_mdcv(mux, enc); // mastering display colour volume
+	mp4_write_pasp(mux);      // pixel aspect ratio
+
+	return write_box_size(out, box_start);
+}
+
+/// AV1 ISOBMFF 2.2. AV1 Sample Entry
+/// Writes an "av01" sample entry: the common visual sample entry fields
+/// followed by the av1C, colr, clli, mdcv and pasp child boxes.
+static size_t mp4_write_av01(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "av01");
+	mp4_write_visual_sample_entry(mux, enc);
+
+	/* Child boxes, in file order */
+	mp4_write_av1C(mux, enc); // codec configuration
+	mp4_write_colr(mux, enc); // colour information
+	mp4_write_clli(mux, enc); // content light level
+	mp4_write_mdcv(mux, enc); // mastering display colour volume
+	mp4_write_pasp(mux);      // pixel aspect ratio
+
+	return write_box_size(out, box_start);
+}
+
+/* Writes a descriptor header: the tag byte followed by the size encoded in
+ * exactly four bytes of 7 bits each, most significant group first. */
+static inline void put_descr(struct serializer *s, uint8_t tag, size_t size)
+{
+	s_w8(s, tag);
+
+	/* Three leading groups (bits 27..7), each with the high bit set */
+	for (int shift = 21; shift >= 7; shift -= 7)
+		s_w8(s, (uint8_t)((size >> shift) | 0x80));
+
+	/* Final group: low 7 bits with the high bit clear */
+	s_w8(s, size & 0x7F);
+}
+
+/// ISO/IEC 14496-14 5.6 ESDBox
+/// Writes an "esds" box consisting of an ES_Descriptor that wraps a
+/// DecoderConfigDescriptor (with optional DecoderSpecificInfo) and an
+/// SLConfigDescriptor.
+static size_t mp4_write_esds(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_fullbox(out, 0, "esds", 0, 0);
+
+	/* The encoder's extra data, when present, is written verbatim as
+	 * the DecoderSpecificInfo payload. */
+	uint8_t *dsi;
+	size_t dsi_size;
+	if (!obs_encoder_get_extra_data(track->encoder, &dsi, &dsi_size))
+		dsi_size = 0;
+
+	/* Payload plus the 5 header bytes put_descr() emits; zero when the
+	 * descriptor is left out entirely. */
+	size_t dsi_descr_len = 0;
+	if (dsi_size)
+		dsi_descr_len = dsi_size + 5;
+
+	/// ISO/IEC 14496-1
+
+	// ES_Descriptor
+	put_descr(out, 0x03, 3 + 5 + 13 + dsi_descr_len + 5 + 1);
+	s_wb16(out, track->track_id);
+	s_w8(out, 0x00); // flags
+
+	// DecoderConfigDescriptor
+	put_descr(out, 0x04, 13 + dsi_descr_len);
+	s_w8(out, 0x40); // codec tag, 0x40 = AAC
+	s_w8(out, 0x15); // stream type field (0x15 = audio stream)
+
+	/* Both bitrate fields use the encoder's "bitrate" setting multiplied
+	 * by 1000, or 0 when no settings or no value are available. It could
+	 * in theory be derived from the written chunks for the final MOOV,
+	 * but that precision is not considered important. */
+	uint32_t bitrate = 0;
+	obs_data_t *settings = obs_encoder_get_settings(track->encoder);
+	if (settings) {
+		const int64_t configured =
+			obs_data_get_int(settings, "bitrate");
+		if (configured != 0)
+			bitrate = (uint32_t)(configured * 1000);
+
+		obs_data_release(settings);
+	}
+
+	s_wb24(out, 0);       // bufferSizeDB (in bytes)
+	s_wb32(out, bitrate); // maxbitrate
+	s_wb32(out, bitrate); // avgBitrate
+
+	// DecoderSpecificInfo
+	if (dsi_size) {
+		put_descr(out, 0x05, dsi_size);
+		s_write(out, dsi, dsi_size);
+	}
+
+	// SLConfigDescriptor
+	put_descr(out, 0x06, 1);
+	s_w8(out, 0x02); // 0x2 = reserved for MP4, descriptor is empty
+
+	return write_box_size(out, box_start);
+}
+
+/// 12.2.3 Audio Sample Entry
+/// Writes the fields shared by all audio sample entries; the caller is
+/// responsible for the surrounding box header and any child boxes.
+static inline void mp4_write_audio_sample_entry(struct mp4_mux *mux,
+						struct mp4_track *track,
+						uint8_t version)
+{
+	struct serializer *out = mux->serializer;
+
+	/* ---- SampleEntry fields ---- */
+	for (int i = 0; i < 6; i++)
+		s_w8(out, 0); // reserved (6 bytes)
+
+	s_wb16(out, 1); // data_reference_index
+
+	/* ---- AudioSampleEntry fields ---- */
+	/* Eight bytes either way: version 1 places entry_version in the
+	 * first 16 bits, version 0 leaves everything zero. */
+	if (version != 1) {
+		s_wb32(out, 0); // reserved
+		s_wb32(out, 0); // reserved
+	} else {
+		s_wb16(out, 1); // entry_version
+		s_wb16(out, 0); // reserved
+		s_wb16(out, 0); // reserved
+		s_wb16(out, 0); // reserved
+	}
+
+	const size_t channels =
+		audio_output_get_channels(obs_encoder_audio(track->encoder));
+	const uint32_t sample_rate = track->timescale;
+
+	s_wb16(out, (uint32_t)channels); // channelcount
+
+	/* OBS FLAC is currently always 16 bit and ALAC always 24; if that
+	 * changes in the future this needs to be handled differently.
+	 * Those codecs are self-describing though, so in most cases the
+	 * value should not matter either way. */
+	const uint16_t sample_bits = track->codec == CODEC_ALAC ? 24 : 16;
+	s_wb16(out, sample_bits); // samplesize
+
+	s_wb16(out, 0); // pre_defined
+	s_wb16(out, 0); // reserved
+
+	s_wb32(out, sample_rate << 16); // samplerate
+}
+
+/// 12.2.4 Channel layout
+/// Writes a "chnl" box describing the speaker layout of the track's audio
+/// output, either as a predefined layout or as an explicit position list.
+static size_t mp4_write_chnl(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_fullbox(out, 0, "chnl", 0, 0);
+
+	const struct audio_output_info *aoi =
+		audio_output_get_info(obs_encoder_audio(track->encoder));
+
+	s_w8(out, 1); // stream_structure (1 = channels)
+
+	/* 5.1 and 4.1 have no corresponding ISO layout, so for those a
+	 * manually created channel map is written instead. */
+	uint8_t positions[8] = {0};
+	uint8_t position_count = 0;
+	uint8_t iso_layout = 0;
+
+	get_speaker_positions(aoi->speakers, positions, &position_count,
+			      &iso_layout);
+
+	if (iso_layout) {
+		s_w8(out, iso_layout); // definedLayout
+		s_wb64(out, 0);        // omittedChannelsMap
+	} else {
+		warn("No ISO layout available for speaker layout %d, "
+		     "this may not be supported by all applications!",
+		     aoi->speakers);
+		s_w8(out, 0); // definedLayout
+		// uint8_t speaker_position[count]
+		s_write(out, positions, position_count);
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/// ISO/IEC 14496-14 5.6 MP4AudioSampleEntry
+/// Writes an "mp4a" sample entry with an esds child and, for version 1
+/// sample entries, a chnl child.
+static size_t mp4_write_mp4a(struct mp4_mux *mux, struct mp4_track *track,
+			     uint8_t version)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "mp4a");
+	mp4_write_audio_sample_entry(mux, track, version);
+
+	mp4_write_esds(mux, track); // esds
+
+	/* The channel layout is only written for version 1 sample entries. */
+	const bool with_layout = version == 1;
+	if (with_layout)
+		mp4_write_chnl(mux, track);
+
+	return write_box_size(out, box_start);
+}
+
+/// Encapsulation of FLAC in ISO Base Media File Format 3.3.2 FLAC Specific Box
+/// Wraps the encoder's extra data in a single FLAC metadata block. Returns 0
+/// without writing anything if the encoder has no extra data.
+static size_t mp4_write_dfLa(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	uint8_t *block_data;
+	size_t block_len;
+	const bool have_extra = obs_encoder_get_extra_data(
+		track->encoder, &block_data, &block_len);
+	if (!have_extra)
+		return 0;
+
+	write_fullbox(out, 0, "dfLa", 0, 0);
+
+	/// FLACMetadataBlock
+
+	/* Header byte: LastMetadataBlockFlag (1) in the top bit,
+	 * BlockType (0) in the remaining seven. */
+	s_w8(out, 0x80);
+	s_wb24(out, (uint32_t)block_len);    // Length
+	s_write(out, block_data, block_len); // BlockData[Length]
+
+	return write_box_size(out, box_start);
+}
+
+/// Encapsulation of FLAC in ISO Base Media File Format 3.3.1 FLACSampleEntry
+/// Writes an "fLaC" sample entry with a dfLa child and, for version 1 sample
+/// entries, a chnl child.
+static size_t mp4_write_fLaC(struct mp4_mux *mux, struct mp4_track *track,
+			     uint8_t version)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "fLaC");
+	mp4_write_audio_sample_entry(mux, track, version);
+
+	mp4_write_dfLa(mux, track); // dfLa
+
+	/* The channel layout is only written for version 1 sample entries. */
+	const bool with_layout = version == 1;
+	if (with_layout)
+		mp4_write_chnl(mux, track);
+
+	return write_box_size(out, box_start);
+}
+
+/// Apple Lossless Format "Magic Cookie" Description - MP4/M4A File
+/// Writes an "alac" sample entry whose payload is the encoder's extra data,
+/// plus a chnl child for version 1 sample entries. Returns 0 without writing
+/// anything if the encoder has no extra data.
+static size_t mp4_write_alac(struct mp4_mux *mux, struct mp4_track *track,
+			     uint8_t version)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	uint8_t *cookie;
+	size_t cookie_size;
+	const bool have_cookie = obs_encoder_get_extra_data(
+		track->encoder, &cookie, &cookie_size);
+	if (!have_cookie)
+		return 0;
+
+	write_box(out, 0, "alac");
+	mp4_write_audio_sample_entry(mux, track, version);
+
+	/* Apple Lossless Magic Cookie, written verbatim */
+	s_write(out, cookie, cookie_size);
+
+	/* The channel layout is only written for version 1 sample entries. */
+	const bool with_layout = version == 1;
+	if (with_layout)
+		mp4_write_chnl(mux, track);
+
+	return write_box_size(out, box_start);
+}
+
+/// ISO/IEC 23003-5 5.1 PCM configuration
+/// Writes a "pcmC" box with the endianness flag and the sample size that
+/// corresponds to the track's PCM codec.
+static size_t mp4_write_pcmc(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_fullbox(out, 0, "pcmC", 0, 0);
+
+	s_w8(out, 1); // endianness, 1 = little endian
+
+	/* Bits per sample; 0 marks a codec that is none of the PCM variants,
+	 * in which case no sample size byte is written at all. */
+	uint8_t sample_bits = 0;
+	if (track->codec == CODEC_PCM_I16)
+		sample_bits = 16;
+	else if (track->codec == CODEC_PCM_I24)
+		sample_bits = 24;
+	else if (track->codec == CODEC_PCM_F32)
+		sample_bits = 32;
+
+	if (sample_bits != 0)
+		s_w8(out, sample_bits);
+
+	return write_box_size(out, box_start);
+}
+
+/// ISO/IEC 23003-5 5.1 PCM configuration
+/// Writes the PCM sample entry ("fpcm" for 32-bit float, "ipcm" otherwise)
+/// with its chnl and pcmC children.
+static size_t mp4_write_xpcm(struct mp4_mux *mux, struct mp4_track *track,
+			     uint8_t version)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	/* Floating point and integer PCM use different box types. */
+	const char *box_type = "ipcm";
+	if (track->codec == CODEC_PCM_F32)
+		box_type = "fpcm";
+	write_box(out, 0, box_type);
+
+	mp4_write_audio_sample_entry(mux, track, version);
+
+	/* ChannelLayout (chnl) is required for PCM, so unlike the other
+	 * audio sample entries it is written regardless of the version. */
+	mp4_write_chnl(mux, track);
+
+	mp4_write_pcmc(mux, track); // pcmC
+
+	return write_box_size(out, box_start);
+}
+
+/// (QTFF/Apple) Text sample description
+/// Writes a "text" box with one entry whose content is the fixed
+/// TEXT_STUB_HEADER blob.
+static size_t mp4_write_text(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_fullbox(out, 0, "text", 0, 0);
+
+	s_wb32(out, 1); // number of entries
+
+	/* Preset sample description as used by FFmpeg. */
+	const size_t stub_size = sizeof(TEXT_STUB_HEADER);
+	s_write(out, &TEXT_STUB_HEADER, stub_size);
+
+	return write_box_size(out, box_start);
+}
+
+/* Reads an unsigned 32-bit little-endian value from the four bytes at ptr. */
+static inline uint32_t rl32(const uint8_t *ptr)
+{
+	/* Widen every byte to uint32_t before shifting. With plain integer
+	 * promotion "ptr[3] << 24" is a shift of a signed int into its sign
+	 * bit, which is undefined behaviour for byte values >= 0x80. */
+	return ((uint32_t)ptr[3] << 24) | ((uint32_t)ptr[2] << 16) |
+	       ((uint32_t)ptr[1] << 8) | (uint32_t)ptr[0];
+}
+
+/* Reads an unsigned 16-bit little-endian value from the two bytes at ptr. */
+static inline uint16_t rl16(const uint8_t *ptr)
+{
+	const unsigned low_byte = ptr[0];
+	const unsigned high_byte = ptr[1];
+
+	return (uint16_t)((high_byte << 8) | low_byte);
+}
+
+/// Encapsulation of Opus in ISO Base Media File Format 4.3.2 Opus Specific Box
+/// Converts the encoder's extra data (read at the fixed offsets used below,
+/// all multi-byte fields little-endian) into a big-endian "dOps" box.
+/// Returns the size of the box, or 0 without writing anything if the encoder
+/// has no extra data or the extra data is too short to contain every field
+/// that would be read.
+static size_t mp4_write_dOps(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+	int64_t start = serializer_get_pos(s);
+
+	uint8_t *extradata;
+	size_t extradata_size;
+
+	if (!obs_encoder_get_extra_data(track->encoder, &extradata,
+					&extradata_size))
+		return 0;
+
+	/* The fixed part read below spans offsets 9..18, so at least 19
+	 * bytes are required before anything may be dereferenced. */
+	const size_t fixed_size = 19;
+	if (!extradata || extradata_size < fixed_size)
+		return 0;
+
+	uint8_t channels = extradata[9];
+	uint8_t channel_map = extradata[18];
+
+	/* A non-zero mapping family is followed by two count bytes and one
+	 * mapping byte per channel; make sure those are present as well. */
+	size_t mapping_size = channel_map ? 2 + (size_t)channels : 0;
+	if (extradata_size - fixed_size < mapping_size)
+		return 0;
+
+	write_box(s, 0, "dOps");
+	s_w8(s, 0); // version
+
+	s_w8(s, channels); // channel count
+	// OpusHead is little-endian, but MP4 is big-endian, so we have to swap them here
+	s_wb16(s, rl16(extradata + 10)); // pre-skip
+	s_wb32(s, rl32(extradata + 12)); // input sample rate
+	s_wb16(s, rl16(extradata + 16)); // output gain
+	s_w8(s, channel_map);            // channel mapping family
+
+	if (channel_map)
+		s_write(s, extradata + fixed_size, mapping_size);
+
+	return write_box_size(s, start);
+}
+
+/// Encapsulation of Opus in ISO Base Media File Format 4.3.1 Sample entry format
+/// Writes an "Opus" sample entry with a dOps child and, for version 1 sample
+/// entries, a chnl child.
+static size_t mp4_write_Opus(struct mp4_mux *mux, struct mp4_track *track,
+			     uint8_t version)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "Opus");
+	mp4_write_audio_sample_entry(mux, track, version);
+
+	mp4_write_dOps(mux, track); // dOps
+
+	/* The channel layout is only written for version 1 sample entries. */
+	const bool with_layout = version == 1;
+	if (with_layout)
+		mp4_write_chnl(mux, track);
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.5.2 Sample Description Box
+/// Writes an "stsd" box with exactly one entry, chosen by the track's type
+/// and codec. Unknown type/codec combinations produce no entry body.
+static size_t mp4_write_stsd(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	/* Anything but mono or stereo technically requires v1, but in
+	 * practice that does not appear to matter. Non-audio tracks always
+	 * use version 0. */
+	uint8_t version = 0;
+	if (track->type == TRACK_AUDIO) {
+		const size_t channels = audio_output_get_channels(
+			obs_encoder_audio(track->encoder));
+		if (channels > 2)
+			version = 1;
+	}
+
+	write_fullbox(out, 0, "stsd", version, 0);
+
+	s_wb32(out, 1); // entry_count
+
+	/* Codec specific sample entry */
+	switch (track->type) {
+	case TRACK_VIDEO:
+		switch (track->codec) {
+		case CODEC_H264:
+			mp4_write_avc1(mux, track->encoder);
+			break;
+		case CODEC_HEVC:
+			mp4_write_hvc1(mux, track->encoder);
+			break;
+		case CODEC_AV1:
+			mp4_write_av01(mux, track->encoder);
+			break;
+		default:
+			break;
+		}
+		break;
+	case TRACK_AUDIO:
+		switch (track->codec) {
+		case CODEC_AAC:
+			mp4_write_mp4a(mux, track, version);
+			break;
+		case CODEC_OPUS:
+			mp4_write_Opus(mux, track, version);
+			break;
+		case CODEC_FLAC:
+			mp4_write_fLaC(mux, track, version);
+			break;
+		case CODEC_ALAC:
+			mp4_write_alac(mux, track, version);
+			break;
+		case CODEC_PCM_I16:
+		case CODEC_PCM_I24:
+		case CODEC_PCM_F32:
+			mp4_write_xpcm(mux, track, version);
+			break;
+		default:
+			break;
+		}
+		break;
+	case TRACK_CHAPTERS:
+		mp4_write_text(mux);
+		break;
+	default:
+		break;
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.6.1.2 Decoding Time to Sample Box
+/// Writes an "stts" box. In fragmented mode the table is empty (16 bytes);
+/// otherwise one entry is written per run in track->deltas, with the delta
+/// rescaled from the track's timebase to its timescale.
+static size_t mp4_write_stts(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *out = mux->serializer;
+
+	if (fragmented) {
+		const size_t empty_size = 16;
+		write_fullbox(out, empty_size, "stts", 0, 0);
+		s_wb32(out, 0); // entry_count
+		return empty_size;
+	}
+
+	const int64_t box_start = serializer_get_pos(out);
+	const size_t run_count = track->deltas.num;
+
+	write_fullbox(out, 0, "stts", 0, 0);
+
+	s_wb32(out, (uint32_t)run_count); // entry_count
+
+	struct sample_delta *run = track->deltas.array;
+	for (size_t left = run_count; left > 0; left--, run++) {
+		const uint64_t scaled = util_mul_div64(
+			run->delta, track->timescale, track->timebase_den);
+
+		s_wb32(out, run->count);       // sample_count
+		s_wb32(out, (uint32_t)scaled); // sample_delta
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.6.2 Sync Sample Box
+/// Writes an "stss" box listing every entry of track->sync_samples and
+/// returns its size. Nothing is written (and 0 returned) if the list is empty.
+static size_t mp4_write_stss(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const uint32_t entries = (uint32_t)track->sync_samples.num;
+
+	if (entries == 0)
+		return 0;
+
+	/* 16 byte FullBox header + 4-bytes (u32) per sync sample */
+	const uint32_t box_size = 16 + 4 * entries;
+
+	write_fullbox(out, box_size, "stss", 0, 0);
+	s_wb32(out, entries); // entry_count
+
+	size_t pos = 0;
+	while (pos < entries) {
+		s_wb32(out, track->sync_samples.array[pos]); // sample_number
+		pos++;
+	}
+
+	return box_size;
+}
+
+/// 8.6.1.3 Composition Time to Sample Box
+/// Writes a "ctts" box with one entry per run in track->offsets, rescaling
+/// each offset from the track's timebase to its timescale. Box version 1 is
+/// used when MP4_USE_NEGATIVE_CTS is set in the muxer flags, 0 otherwise.
+static size_t mp4_write_ctts(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const uint32_t entries = (uint32_t)track->offsets.num;
+
+	uint8_t version = 0;
+	if (mux->flags & MP4_USE_NEGATIVE_CTS)
+		version = 1;
+
+	/* 16 byte FullBox header + 8-bytes (u32+u32/i32) per offset entry */
+	const uint32_t box_size = 16 + 8 * entries;
+	write_fullbox(out, box_size, "ctts", version, 0);
+
+	s_wb32(out, entries); // entry_count
+
+	const int64_t timescale = (int64_t)track->timescale;
+	const int64_t timebase_den = (int64_t)track->timebase_den;
+
+	for (size_t pos = 0; pos < entries; pos++) {
+		/* Signed arithmetic, multiply first and then divide */
+		const int64_t scaled =
+			(int64_t)track->offsets.array[pos].offset * timescale /
+			timebase_den;
+
+		s_wb32(out, track->offsets.array[pos].count); // sample_count
+		s_wb32(out, (uint32_t)scaled);                // sample_offset
+	}
+
+	return box_size;
+}
+
+/// 8.7.4 Sample To Chunk Box
+/// Writes an "stsc" box and returns its size in bytes. In fragmented mode an
+/// empty table (16 bytes) is written. Otherwise track->chunks is collapsed
+/// into runs of consecutive chunks that hold the same number of samples, and
+/// one table entry is written per run.
+static size_t mp4_write_stsc(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *s = mux->serializer;
+
+	if (fragmented) {
+		write_fullbox(s, 16, "stsc", 0, 0);
+		s_wb32(s, 0); // entry_count
+		return 16;
+	}
+
+	struct chunk *arr = track->chunks.array;
+	size_t arr_num = track->chunks.num;
+
+	/* Compress into array with counter for repeating chunk sizes */
+	DARRAY(struct chunk_run {
+		uint32_t first;   // 1-based index of the first chunk of the run
+		uint32_t samples; // samples per chunk shared by the whole run
+	}) chunk_runs;
+
+	da_init(chunk_runs);
+
+	for (size_t idx = 0; idx < arr_num; idx++) {
+		struct chunk *chk = &arr[idx];
+
+		/* Start a new run for the very first chunk and whenever the
+		 * sample count differs from that of the current (last) run;
+		 * chunks matching the current run add no entry. */
+		if (!chunk_runs.num ||
+		    chunk_runs.array[chunk_runs.num - 1].samples !=
+			    chk->samples) {
+			struct chunk_run *cr = da_push_back_new(chunk_runs);
+			cr->samples = chk->samples;
+			cr->first = (uint32_t)idx + 1; // ISO-BMFF is 1-indexed
+		}
+	}
+
+	uint32_t num = (uint32_t)chunk_runs.num;
+
+	/* 16 byte FullBox header + 12-bytes (u32+u32+u32) per chunk run */
+	uint32_t size = 16 + 12 * num;
+	write_fullbox(s, size, "stsc", 0, 0);
+
+	s_wb32(s, num); // entry_count
+
+	for (size_t idx = 0; idx < num; idx++) {
+		struct chunk_run *cr = &chunk_runs.array[idx];
+		s_wb32(s, cr->first);   // first_chunk
+		s_wb32(s, cr->samples); // samples_per_chunk
+		s_wb32(s, 1);           // sample_description_index
+	}
+
+	/* The run list is only needed while writing the box. */
+	da_free(chunk_runs);
+
+	return size;
+}
+
+/// 8.7.3 Sample Size Boxes
+/// Writes an "stsz" box. In fragmented mode the table is empty (20 bytes).
+/// Otherwise either a single fixed sample size plus count is written, or a
+/// per-sample size table taken from track->sample_sizes.
+static size_t mp4_write_stsz(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *out = mux->serializer;
+
+	if (fragmented) {
+		const size_t empty_size = 20;
+		write_fullbox(out, empty_size, "stsz", 0, 0);
+		s_wb32(out, 0); // sample_size
+		s_wb32(out, 0); // sample_count
+
+		return empty_size;
+	}
+
+	const int64_t box_start = serializer_get_pos(out);
+
+	/* The sample count field is only 32 bit wide. Exceeding it should
+	 * only ever happen when recording > 24 hours of 48 kHz PCM audio or
+	 * 828 days of 60 FPS video. */
+	if (track->samples > UINT32_MAX) {
+		warn("Track %u has too many samples, its duration may not be "
+		     "read correctly. Remuxing the file to another format such "
+		     "as MKV may be required.",
+		     track->track_id);
+	}
+
+	write_fullbox(out, 0, "stsz", 0, 0);
+
+	if (!track->sample_size) {
+		/* Variable sizes: a zero sample_size followed by one table
+		 * entry per sample. */
+		const size_t entries = track->sample_sizes.num;
+
+		s_wb32(out, 0);                 // sample_size
+		s_wb32(out, (uint32_t)entries); // sample_count
+
+		for (size_t pos = 0; pos < track->sample_sizes.num; pos++)
+			s_wb32(out, track->sample_sizes.array[pos]); // entry_size
+	} else {
+		/* Fixed size samples mean we don't need an array */
+		s_wb32(out, track->sample_size);       // sample_size
+		s_wb32(out, (uint32_t)track->samples); // sample_count
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.7.5 Chunk Offset Box
+/// Writes the chunk offset table of a track and returns the box size in
+/// bytes. In fragmented mode an empty "stco" box (16 bytes) is written.
+/// Otherwise one offset per entry of track->chunks is written, as a "co64"
+/// box with 64-bit offsets if the last chunk's offset does not fit into 32
+/// bit, and as an "stco" box with 32-bit offsets if it does. A track without
+/// any chunks yields an "stco" box with zero entries.
+static size_t mp4_write_stco(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *s = mux->serializer;
+
+	if (fragmented) {
+		write_fullbox(s, 16, "stco", 0, 0);
+		s_wb32(s, 0); // entry_count
+		return 16;
+	}
+
+	struct chunk *arr = track->chunks.array;
+	uint32_t num = (uint32_t)track->chunks.num;
+
+	/* Only the last chunk is inspected to choose the offset width; this
+	 * presumably relies on offsets being stored in ascending order.
+	 * With no chunks at all "arr[num - 1]" would wrap around and read far
+	 * out of bounds, so an empty table is treated as offset 0. */
+	uint64_t last_off = num ? arr[num - 1].offset : 0;
+	uint32_t size;
+	bool co64 = last_off > UINT32_MAX;
+
+	/* When using 64-bit offsets we write 8-bytes (u64) per chunk,
+	 * otherwise 4-bytes (u32). */
+	if (co64) {
+		size = 16 + 8 * num;
+		write_fullbox(s, size, "co64", 0, 0);
+	} else {
+		size = 16 + 4 * num;
+		write_fullbox(s, size, "stco", 0, 0);
+	}
+
+	s_wb32(s, num); // entry_count
+
+	for (size_t idx = 0; idx < num; idx++) {
+		if (co64)
+			s_wb64(s, arr[idx].offset); // chunk_offset
+		else
+			s_wb32(s, (uint32_t)arr[idx].offset); // chunk_offset
+	}
+
+	return size;
+}
+
+/// 8.9.3 Sample Group Description Box
+/// Writes the "roll" sample group description used for AAC tracks: a single
+/// AudioRollRecoveryEntry with a roll distance of -1.
+static size_t mp4_write_sgpd_aac(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_fullbox(out, 0, "sgpd", 1, 0);
+
+	s_write(out, "roll", 4); // grouping_type
+	s_wb32(out, 2);          // default_length (i16)
+	s_wb32(out, 1);          // entry_count
+
+	/* AudioRollRecoveryEntry */
+	s_wb16(out, -1); // roll_distance
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.9.2 Sample to Group Box
+/// Writes the "roll" sample-to-group box used for AAC tracks: one entry that
+/// assigns all of the track's samples to group description index 1.
+static size_t mp4_write_sbgp_aac(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_fullbox(out, 0, "sbgp", 0, 0);
+
+	/// 10.1 AudioRollRecoveryEntry
+	s_write(out, "roll", 4); // grouping_type
+	s_wb32(out, 1);          // entry_count
+
+	/* The single entry covers the whole track. */
+	const uint32_t all_samples = (uint32_t)track->samples;
+	s_wb32(out, all_samples); // sample_count
+	s_wb32(out, 1);           // group_description_index
+
+	return write_box_size(out, box_start);
+}
+
+/* Writes the "roll" sample group boxes for an Opus track: first an sgpd box
+ * with a single roll-recovery entry, then an sbgp box with two entries.
+ * Returns the combined size of both boxes. */
+static size_t mp4_write_sbgp_sbgp_opus(struct mp4_mux *mux,
+				       struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+	int64_t start = serializer_get_pos(s);
+
+	/// 8.9.3 Sample Group Description Box
+	write_fullbox(s, 0, "sgpd", 1, 0);
+
+	s_write(s, "roll", 4); // grouping_type
+	s_wb32(s, 2);          // default_length (i16)
+
+	/* Opus requires 80 ms of preroll, which at 48 kHz is 3840 PCM samples */
+	const int64_t opus_preroll = 3840;
+
+	/* Compute the preroll samples (should be 4, each being 20 ms) */
+	uint16_t preroll_count = 0;
+	int64_t preroll_remaining = opus_preroll;
+
+	/* Walk the delta runs sample by sample, subtracting each sample's
+	 * delta until the preroll budget is used up. The delta is compared
+	 * directly against the 48 kHz sample count above, so this assumes
+	 * deltas are expressed in 48 kHz units - TODO confirm. */
+	for (size_t i = 0; i < track->deltas.num && preroll_remaining > 0;
+	     i++) {
+		for (uint32_t j = 0;
+		     j < track->deltas.array[i].count && preroll_remaining > 0;
+		     j++) {
+			preroll_remaining -= track->deltas.array[i].delta;
+			preroll_count++;
+		}
+	}
+
+	s_wb32(s, 1); // entry_count
+	/// 10.1 AudioRollRecoveryEntry
+	s_wb16(s, -preroll_count); // roll_distance
+
+	size_t size_sgpd = write_box_size(s, start);
+
+	/* --------------- */
+
+	/// 8.9.2 Sample to Group Box
+	start = serializer_get_pos(s);
+	write_fullbox(s, 0, "sbgp", 0, 0);
+
+	s_write(s, "roll", 4); // grouping_type
+	s_wb32(s, 2);          // entry_count
+
+	/* The first preroll_count samples get group index 0, all remaining
+	 * samples reference the description entry written above. */
+	// entry 0
+	s_wb32(s, preroll_count); // sample_count
+	s_wb32(s, 0);             // group_description_index
+	// entry 1
+	s_wb32(s, (uint32_t)track->samples - preroll_count); // sample_count
+	s_wb32(s, 1); // group_description_index
+
+	return size_sgpd + write_box_size(s, start);
+}
+
+/// 8.5.1 Sample Table Box
+/// Writes an "stbl" box with all of its sample table children. Several
+/// children are only written for the complete (non-fragmented) variant.
+static size_t mp4_write_stbl(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+	const bool complete = !fragmented;
+
+	write_box(out, 0, "stbl");
+
+	mp4_write_stsd(mux, track);             // stsd
+	mp4_write_stts(mux, track, fragmented); // stts
+
+	/* stss: video tracks of complete files only */
+	if (complete && track->type == TRACK_VIDEO)
+		mp4_write_stss(mux, track);
+
+	/* ctts: complete files only, and only if the track needs it */
+	if (complete && track->needs_ctts)
+		mp4_write_ctts(mux, track);
+
+	mp4_write_stsc(mux, track, fragmented); // stsc
+	mp4_write_stsz(mux, track, fragmented); // stsz
+	mp4_write_stco(mux, track, fragmented); // stco
+
+	/* AAC and Opus require a pre-roll to get correct decoder output;
+	 * sgpd and sbgp are used to create a "roll" group. These are only
+	 * written for complete files. */
+	if (complete && track->codec == CODEC_AAC) {
+		mp4_write_sgpd_aac(mux);        // sgpd
+		mp4_write_sbgp_aac(mux, track); // sbgp
+	} else if (complete && track->codec == CODEC_OPUS) {
+		mp4_write_sbgp_sbgp_opus(mux, track); // sgpd + sbgp
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.7.2.2 DataEntryUrlBox
+/// Writes a "url " box without any payload.
+static size_t mp4_write_url(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	/* Flag 1 means the data is in this file, so the box stays empty. */
+	write_fullbox(out, 0, "url ", 0, 1);
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.7.2 Data Reference Box
+/// Writes a "dref" box containing a single entry, the "url " box produced by
+/// mp4_write_url(). Returns the size of the box.
+static size_t mp4_write_dref(struct mp4_mux *mux)
+{
+	struct serializer *s = mux->serializer;
+	int64_t start = serializer_get_pos(s);
+
+	/* Box types are exactly four characters; the literal previously
+	 * carried a stray fifth character (a trailing space). */
+	write_fullbox(s, 0, "dref", 0, 0);
+
+	s_wb32(s, 1); // entry_count
+
+	mp4_write_url(mux);
+
+	return write_box_size(s, start);
+}
+
+/// 8.7.1 Data Information Box
+/// Writes a "dinf" box whose only child is the data reference box.
+static size_t mp4_write_dinf(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "dinf");
+	mp4_write_dref(mux); // dref
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.4.4 Media Information Box
+/// Writes a "minf" box: a media header that depends on the track type,
+/// followed by the dinf and stbl children.
+static size_t mp4_write_minf(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "minf");
+
+	/* Media header: vmhd for video, gmhd for chapters and smhd for
+	 * every other track type. */
+	switch (track->type) {
+	case TRACK_VIDEO:
+		mp4_write_vmhd(mux);
+		break;
+	case TRACK_CHAPTERS:
+		mp4_write_gmhd(mux);
+		break;
+	default:
+		mp4_write_smhd(mux);
+		break;
+	}
+
+	mp4_write_dinf(mux);                    // dinf, unnecessary but mandatory
+	mp4_write_stbl(mux, track, fragmented); // stbl
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.4.1 Media Box
+/// Writes an "mdia" box consisting of the mdhd, hdlr and minf children.
+static size_t mp4_write_mdia(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "mdia");
+
+	/* Children, in file order */
+	mp4_write_mdhd(mux, track);             // mdhd
+	mp4_write_hdlr(mux, track);             // hdlr
+	mp4_write_minf(mux, track, fragmented); // minf
+
+	return write_box_size(out, box_start);
+}
+
+/// (QTFF/Apple) User data atom
+/// Writes a box of the given type whose payload is the string val without
+/// its terminating NUL. val must not be NULL, as its length is measured
+/// with strlen().
+static size_t mp4_write_udta_atom(struct mp4_mux *mux, const char tag[4],
+				  const char *val)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, tag);
+
+	const size_t payload_len = strlen(val);
+	s_write(out, val, payload_len);
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.10.1 User Data Box
+/// Writes the per-track "udta" box. It always contains the encoder name (if
+/// there is one); when MP4_WRITE_ENCODER_INFO is set in the muxer flags the
+/// encoder id and a JSON dump of the encoder settings are added as well.
+/// Values that are unavailable (NULL) are skipped instead of being written.
+static size_t mp4_write_track_udta(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+	int64_t start = serializer_get_pos(s);
+
+	write_box(s, 0, "udta");
+
+	/* Our udta box contains QuickTime format user data atoms, which are
+	 * simple key-value pairs. Some are prefixed with 0xa9. */
+
+	const char *name = obs_encoder_get_name(track->encoder);
+	if (name)
+		mp4_write_udta_atom(mux, "name", name);
+
+	if (mux->flags & MP4_WRITE_ENCODER_INFO) {
+		/* Guard on the value that is actually written: checking
+		 * "name" here would pass a NULL id on to strlen(). */
+		const char *id = obs_encoder_get_id(track->encoder);
+		if (id)
+			mp4_write_udta_atom(mux, "\251enc", id);
+
+		obs_data_t *settings = obs_encoder_get_settings(track->encoder);
+		if (settings) {
+			/* The JSON pointer is checked as well before it is
+			 * handed to mp4_write_udta_atom(). */
+			const char *json =
+				obs_data_get_json_with_defaults(settings);
+			if (json)
+				mp4_write_udta_atom(mux, "json", json);
+			obs_data_release(settings);
+		}
+	}
+
+	return write_box_size(s, start);
+}
+
+/// 8.6.6 Edit List Box
+/// Writes an "elst" box (version 0) with a single edit entry and returns its
+/// size. The entry's media_time is used to skip the initial delay of a track:
+/// the reordering delay for video (unless negative CTS offsets are in use)
+/// or the portion of an audio track that lies before PTS 0.
+static size_t mp4_write_elst(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+	int64_t start = serializer_get_pos(s);
+
+	write_fullbox(s, 0, "elst", 0, 0);
+
+	s_wb32(s, 1); // entry count
+
+	/* Track duration rescaled from the track's timebase to a timescale of
+	 * 1000, which the comment on the write below calls the movie
+	 * timescale. */
+	uint64_t duration =
+		util_mul_div64(track->duration, 1000, track->timebase_den);
+	uint64_t delay = 0;
+
+	if (track->type == TRACK_VIDEO &&
+	    !(mux->flags & MP4_USE_NEGATIVE_CTS)) {
+		/* Compensate for frame-reordering delay (for example, when
+		 * using b-frames). */
+		int64_t dts_offset = 0;
+
+		if (track->offsets.num) {
+			/* Offset of the first recorded composition run */
+			struct sample_offset sample = track->offsets.array[0];
+			dts_offset = sample.offset;
+		} else if (track->packets.size) {
+			/* If no offset data exists yet (i.e. when writing the
+			 * incomplete moov in a fragmented file) use the raw
+			 * data from the current queued packets instead. */
+			struct encoder_packet pkt;
+			deque_peek_front(&track->packets, &pkt, sizeof(pkt));
+			dts_offset = pkt.pts - pkt.dts;
+		}
+
+		/* Rescale from the track's timebase to its timescale */
+		delay = util_mul_div64(dts_offset, track->timescale,
+				       track->timebase_den);
+	} else if (track->type == TRACK_AUDIO && track->first_pts < 0) {
+		/* A negative first PTS is treated as priming delay: skip
+		 * that many timescale units at the start of the media. */
+		delay = util_mul_div64(llabs(track->first_pts),
+				       track->timescale, track->timebase_den);
+		/* Subtract priming delay from total duration */
+		duration -= util_mul_div64(delay, 1000, track->timescale);
+	}
+
+	/* All three fields are 32 bit in a version 0 box, so the 64-bit
+	 * values are truncated here. */
+	s_wb32(s, (uint32_t)duration); // segment_duration (movie timescale)
+	s_wb32(s, (uint32_t)delay);    // media_time (track timescale)
+	s_wb32(s, 1 << 16);            // media_rate
+
+	return write_box_size(s, start);
+}
+
+/// 8.6.5 Edit Box
+/// Writes an "edts" box whose only child is the track's edit list.
+static size_t mp4_write_edts(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "edts");
+	mp4_write_elst(mux, track); // elst
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.3.3.2 TrackReferenceTypeBox
+/// Writes a "chap" reference holding the id of the muxer's chapter track.
+/// mux->chapter_track is dereferenced unconditionally, so it must be set.
+static size_t mp4_write_chap(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	/// QTFF/Apple chapter track reference
+	write_box(out, 0, "chap");
+
+	const uint32_t chapter_id = mux->chapter_track->track_id;
+	s_wb32(out, chapter_id);
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.3.3 Track Reference Box
+/// Writes a "tref" box whose only child is the chapter track reference.
+static size_t mp4_write_tref(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "tref");
+	mp4_write_chap(mux); // chap
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.3.1 Track Box
+/* Emits one "trak" box with its children in the order tkhd, edts,
+ * (optional) tref, mdia, udta. `fragmented` is forwarded to the mdia
+ * writer unchanged. */
+static size_t mp4_write_trak(struct mp4_mux *mux, struct mp4_track *track,
+			     bool fragmented)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+	/* A chapter reference is only written when a chapter track exists
+	 * and this is not the chapter track itself. */
+	const bool wants_tref = mux->chapter_track &&
+				track->type != TRACK_CHAPTERS;
+
+	write_box(out, 0, "trak");
+
+	mp4_write_tkhd(mux, track); // tkhd
+	mp4_write_edts(mux, track); // edts
+
+	if (wants_tref)
+		mp4_write_tref(mux); // tref
+
+	mp4_write_mdia(mux, track, fragmented); // mdia
+	mp4_write_track_udta(mux, track); // udta (audio track name mainly)
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.8.3 Track Extends Box
+/* Emits a fixed-size "trex" box for `track_id`. All defaults are written
+ * as zero except the sample description index, which is 1. */
+static size_t mp4_write_trex(struct mp4_mux *mux, uint32_t track_id)
+{
+	/* Total box size declared to write_fullbox() and returned as-is. */
+	enum { TREX_BOX_SIZE = 32 };
+	struct serializer *out = mux->serializer;
+
+	write_fullbox(out, TREX_BOX_SIZE, "trex", 0, 0);
+
+	s_wb32(out, track_id); // track_ID
+	s_wb32(out, 1);        // default_sample_description_index
+	s_wb32(out, 0);        // default_sample_duration
+	s_wb32(out, 0);        // default_sample_size
+	s_wb32(out, 0);        // default_sample_flags
+
+	return TREX_BOX_SIZE;
+}
+
+/// 8.8.1 Movie Extends Box
+/* Emits the "mvex" container with one "trex" child per entry in
+ * mux->tracks. The ID handed to each trex is the array index plus one. */
+static size_t mp4_write_mvex(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "mvex");
+
+	for (size_t idx = 0; idx < mux->tracks.num; idx++) {
+		const uint32_t id = (uint32_t)(idx + 1);
+		mp4_write_trex(mux, id);
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/// (QTFF/Apple) Undocumented QuickTime/iTunes metadata handler
+/* Emits a fixed-size "hdlr" box with handler type "mdir", the "appl"
+ * marker in the first reserved word and an empty (single NUL) name. */
+static size_t mp4_write_itunes_hdlr(struct mp4_mux *mux)
+{
+	/* Total box size declared to write_fullbox() and returned as-is. */
+	enum { HDLR_BOX_SIZE = 33 };
+	struct serializer *out = mux->serializer;
+
+	write_fullbox(out, HDLR_BOX_SIZE, "hdlr", 0, 0);
+
+	s_wb32(out, 0);          // pre_defined
+	s_write(out, "mdir", 4); // handler_type
+
+	// reserved (first word carries "appl")
+	s_write(out, "appl", 4);
+	s_wb32(out, 0);
+	s_wb32(out, 0);
+
+	s_w8(out, 0); // name (NULL)
+
+	return HDLR_BOX_SIZE;
+}
+
+/// (QTFF/Apple) Data atom
+/* Emits a "data" atom holding `data` as a UTF-8 string (no terminator is
+ * written). Returns the declared atom size: string length plus 16. */
+static size_t mp4_write_data_atom(struct mp4_mux *mux, const char *data)
+{
+	struct serializer *out = mux->serializer;
+	const size_t payload_len = strlen(data);
+	/* 16 bytes are accounted for in addition to the payload. */
+	const uint32_t atom_size = (uint32_t)payload_len + 16;
+
+	write_box(out, atom_size, "data");
+
+	s_wb32(out, 1); // type, 1 = utf-8 string
+	s_wb32(out, 0); // locale, 0 = default
+	s_write(out, data, payload_len);
+
+	return atom_size;
+}
+
+/// (QTFF/Apple) Metadata item atom
+/* Emits a box typed `name` that wraps a single "data" atom containing
+ * `value`. */
+static size_t mp4_write_ilst_item_atom(struct mp4_mux *mux, const char name[4],
+				       const char *value)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t item_start = serializer_get_pos(out);
+
+	write_box(out, 0, name);
+	mp4_write_data_atom(mux, value);
+
+	return write_box_size(out, item_start);
+}
+
+/// (QTFF/Apple) Metadata item list atom
+/* Emits the "ilst" atom with a single encoder-name item of the form
+ * "OBS Studio (<version>)". */
+static size_t mp4_write_ilst(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t list_start = serializer_get_pos(out);
+	struct dstr tool_name = {0};
+
+	write_box(out, 0, "ilst");
+
+	/* Encoder name */
+	dstr_cat(&tool_name, "OBS Studio (");
+	dstr_cat(&tool_name, obs_get_version_string());
+	dstr_cat(&tool_name, ")");
+
+	/* Some QuickTime keys are prefixed with 0xa9 */
+	mp4_write_ilst_item_atom(mux, "\251too", tool_name.array);
+	dstr_free(&tool_name);
+
+	return write_box_size(out, list_start);
+}
+
+/// (QTFF/Apple) Key value metadata handler
+/* Emits a fixed-size "hdlr" box with handler type "mdta", all reserved
+ * words zeroed and an empty (single NUL) name. */
+static size_t mp4_write_mdta_hdlr(struct mp4_mux *mux)
+{
+	/* Total box size declared to write_fullbox() and returned as-is. */
+	enum { HDLR_BOX_SIZE = 33 };
+	struct serializer *out = mux->serializer;
+
+	write_fullbox(out, HDLR_BOX_SIZE, "hdlr", 0, 0);
+
+	s_wb32(out, 0);          // pre_defined
+	s_write(out, "mdta", 4); // handler_type
+
+	// reserved
+	s_wb32(out, 0);
+	s_wb32(out, 0);
+	s_wb32(out, 0);
+
+	s_w8(out, 0); // name (NULL)
+
+	return HDLR_BOX_SIZE;
+}
+
+/// (QTFF/Apple) Metadata item keys atom
+/* Emits the "keys" atom: a 32-bit entry count followed by one "mdta"
+ * entry per item in `meta`, each carrying the item's name as key. */
+static size_t mp4_write_mdta_keys(struct mp4_mux *mux, obs_data_t *meta)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_fullbox(out, 0, "keys", 0, 0);
+
+	/* Reserve the count field; the real value is patched in below once
+	 * all items have been visited. */
+	const int64_t count_field_pos = serializer_get_pos(out);
+	uint32_t num_keys = 0;
+	s_wb32(out, num_keys); // count
+
+	for (obs_data_item_t *item = obs_data_first(meta); item != NULL;
+	     obs_data_item_next(&item)) {
+		const char *key = obs_data_item_get_name(item);
+		const size_t key_len = strlen(key);
+
+		/* name is key type, can be udta or mdta */
+		write_box(out, key_len + 8, "mdta");
+		s_write(out, key, key_len); // key name
+
+		num_keys++;
+	}
+
+	/* Overwrite count with correct value, then resume at the end */
+	const int64_t resume_pos = serializer_get_pos(out);
+	serializer_seek(out, count_field_pos, SERIALIZE_SEEK_START);
+	s_wb32(out, num_keys);
+	serializer_seek(out, resume_pos, SERIALIZE_SEEK_START);
+
+	return write_box_size(out, box_start);
+}
+
+/// (QTFF/Apple) Metadata item atom, but name is an index instead
+/* Emits one value entry: a size placeholder, the key index `idx`, and a
+ * "data" atom containing the item's string value. */
+static inline void write_key_entry(struct mp4_mux *mux, obs_data_item_t *item,
+				   uint32_t idx)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t entry_start = serializer_get_pos(out);
+	const char *text = obs_data_item_get_string(item);
+
+	s_wb32(out, 0);   // size
+	s_wb32(out, idx); // index
+
+	mp4_write_data_atom(mux, text);
+
+	write_box_size(out, entry_start);
+}
+
+/// (QTFF/Apple) Metadata item list atom
+/* Emits the "ilst" atom with one indexed value entry per item in `meta`,
+ * visiting items in obs_data iteration order. */
+static size_t mp4_write_mdta_ilst(struct mp4_mux *mux, obs_data_t *meta)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t list_start = serializer_get_pos(out);
+
+	write_box(out, 0, "ilst");
+
+	/* indices start with 1 */
+	uint32_t next_idx = 1;
+
+	for (obs_data_item_t *item = obs_data_first(meta); item != NULL;
+	     obs_data_item_next(&item))
+		write_key_entry(mux, item, next_idx++);
+
+	return write_box_size(out, list_start);
+}
+
+/* Builds the key/value metadata set (currently only "tool", set to
+ * "OBS Studio (<version>)") and writes it as a keys atom followed by the
+ * matching item list. */
+static void mp4_write_mdta_kv(struct mp4_mux *mux)
+{
+	obs_data_t *metadata = obs_data_create();
+	struct dstr tool_name = {0};
+
+	dstr_cat(&tool_name, "OBS Studio (");
+	dstr_cat(&tool_name, obs_get_version_string());
+	dstr_cat(&tool_name, ")");
+
+	// ToDo figure out what else we could put in here for fun and profit :)
+	obs_data_set_string(metadata, "tool", tool_name.array);
+
+	mp4_write_mdta_keys(mux, metadata); /* Write keys */
+	mp4_write_mdta_ilst(mux, metadata); /* Write values */
+
+	obs_data_release(metadata);
+	dstr_free(&tool_name);
+}
+
+/// 8.11.1 The Meta box
+/* Emits the "meta" box. MP4_USE_MDTA_KEY_VALUE selects the mdta key/value
+ * layout; otherwise the iTunes-style handler and item list are used. */
+static size_t mp4_write_meta(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+	const bool key_value_style =
+		(mux->flags & MP4_USE_MDTA_KEY_VALUE) != 0;
+
+	write_fullbox(out, 0, "meta", 0, 0);
+
+	if (!key_value_style) {
+		mp4_write_itunes_hdlr(mux);
+		mp4_write_ilst(mux);
+	} else {
+		mp4_write_mdta_hdlr(mux);
+		mp4_write_mdta_kv(mux);
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.10.1 User Data Box
+/* Emits the movie-level "udta" box containing only the "meta" box. */
+static size_t mp4_write_udta(struct mp4_mux *mux)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "udta");
+
+	/* Metadata would normally sit directly in the moov, but this is
+	 * Apple/QTFF format metadata and therefore lives inside udta. */
+	mp4_write_meta(mux); // meta
+
+	return write_box_size(out, box_start);
+}
+
+/// Movie Box (8.2.1)
+/* Emits the "moov" box: mvhd, one trak per regular track, then either
+ * mvex (fragmented) or the chapter trak (non-fragmented, if a chapter
+ * track exists), and finally the metadata udta. */
+static size_t mp4_write_moov(struct mp4_mux *mux, bool fragmented)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "moov");
+
+	mp4_write_mvhd(mux);
+
+	// trak(s)
+	for (size_t idx = 0; idx < mux->tracks.num; idx++)
+		mp4_write_trak(mux, &mux->tracks.array[idx], fragmented);
+
+	if (fragmented) {
+		// mvex
+		mp4_write_mvex(mux);
+	} else if (mux->chapter_track) {
+		// chapter trak is only part of the non-fragmented moov
+		mp4_write_trak(mux, mux->chapter_track, false);
+	}
+
+	// udta (metadata)
+	mp4_write_udta(mux);
+
+	return write_box_size(out, box_start);
+}
+
+/* ========================================================================== */
+/* moof (fragment header) stuff                                               */
+
+/// 8.8.5 Movie Fragment Header Box
+/* Emits a fixed-size "mfhd" box whose sequence number is the current
+ * value of mux->fragments_written. */
+static size_t mp4_write_mfhd(struct mp4_mux *mux)
+{
+	/* Total box size declared to write_fullbox() and returned as-is. */
+	enum { MFHD_BOX_SIZE = 16 };
+	struct serializer *out = mux->serializer;
+
+	write_fullbox(out, MFHD_BOX_SIZE, "mfhd", 0, 0);
+	s_wb32(out, mux->fragments_written); // sequence_number
+
+	return MFHD_BOX_SIZE;
+}
+
+/// 8.8.7 Track Fragment Header Box
+/* Writes the "tfhd" box for one track of the current fragment.
+ *
+ * A default sample duration/size is only advertised when every sample in
+ * track->fragment_samples shares the same value (tracks with a fixed
+ * sample_size always qualify). For variable-size tracks the first fragment
+ * sample is read unconditionally, so the caller must make sure the track
+ * has at least one fragment sample.
+ *
+ * moof_start is written as base_data_offset. Returns the value reported by
+ * write_box_size() for the finished box. */
+static size_t mp4_write_tfhd(struct mp4_mux *mux, struct mp4_track *track,
+			     size_t moof_start)
+{
+	struct serializer *s = mux->serializer;
+	int64_t start = serializer_get_pos(s);
+
+	uint32_t flags = BASE_DATA_OFFSET_PRESENT |
+			 DEFAULT_SAMPLE_FLAGS_PRESENT;
+
+	/* Add default size/duration if all samples match. */
+	bool durations_match = true;
+	bool sizes_match = true;
+	uint32_t duration;
+	uint32_t sample_size;
+
+	if (track->sample_size) {
+		duration = 1;
+		sample_size = track->sample_size;
+	} else {
+		duration = track->fragment_samples.array[0].duration;
+		sample_size = track->fragment_samples.array[0].size;
+
+		for (size_t idx = 1; idx < track->fragment_samples.num; idx++) {
+			uint32_t frag_duration =
+				track->fragment_samples.array[idx].duration;
+			uint32_t frag_size =
+				track->fragment_samples.array[idx].size;
+
+			/* A mismatch must be sticky: once any sample differs
+			 * from the first one the flag stays cleared, rather
+			 * than being overwritten by later comparisons. */
+			if (frag_duration != duration)
+				durations_match = false;
+			if (frag_size != sample_size)
+				sizes_match = false;
+
+			/* Nothing left to learn once both are cleared. */
+			if (!durations_match && !sizes_match)
+				break;
+		}
+	}
+
+	if (durations_match)
+		flags |= DEFAULT_SAMPLE_DURATION_PRESENT;
+	if (sizes_match)
+		flags |= DEFAULT_SAMPLE_SIZE_PRESENT;
+
+	write_fullbox(s, 0, "tfhd", 0, flags);
+
+	s_wb32(s, track->track_id); // track_ID
+	s_wb64(s, moof_start);      // base_data_offset
+
+	// default_sample_duration
+	if (durations_match) {
+		if (track->type == TRACK_VIDEO) {
+			/* Convert duration to track timescale */
+			duration = (uint32_t)util_mul_div64(
+				duration, track->timescale,
+				track->timebase_den);
+		}
+
+		s_wb32(s, duration);
+	}
+	// default_sample_size
+	if (sizes_match)
+		s_wb32(s, sample_size);
+	// default_sample_flags
+	if (track->type == TRACK_VIDEO) {
+		s_wb32(s, SAMPLE_FLAG_DEPENDS_YES | SAMPLE_FLAG_IS_NON_SYNC);
+	} else {
+		s_wb32(s, SAMPLE_FLAG_DEPENDS_NO);
+	}
+
+	return write_box_size(s, start);
+}
+
+/// 8.8.12 Track fragment decode time
+/* Emits a fixed-size version-1 "tfdt" box. The base decode time is the
+ * track's accumulated duration minus the durations of the samples that
+ * belong to the fragment currently being written. */
+static size_t mp4_write_tfdt(struct mp4_mux *mux, struct mp4_track *track)
+{
+	/* Total box size declared to write_fullbox() and returned as-is. */
+	enum { TFDT_BOX_SIZE = 20 };
+	struct serializer *out = mux->serializer;
+
+	write_fullbox(out, TFDT_BOX_SIZE, "tfdt", 1, 0);
+
+	/* Subtract samples that are not written yet */
+	uint64_t decode_time = track->duration;
+	for (size_t idx = 0; idx < track->fragment_samples.num; idx++) {
+		const struct fragment_sample *pending =
+			&track->fragment_samples.array[idx];
+		decode_time -= pending->duration;
+	}
+
+	/* Only video is converted to the track timescale */
+	if (track->type == TRACK_VIDEO)
+		decode_time = util_mul_div64(decode_time, track->timescale,
+					     track->timebase_den);
+
+	s_wb64(out, decode_time); // baseMediaDecodeTime
+
+	return TFDT_BOX_SIZE;
+}
+
+/// 8.8.8 Track Fragment Run Box
+/* Writes the "trun" box for one track of the current fragment.
+ *
+ * moof_size is the size of the enclosing moof (0 on the sizing pass done by
+ * the caller). *samples_mdat_offset is the running byte offset of this
+ * track's data within the mdat payload; it is read to compute data_offset
+ * and then advanced by the total size of this track's fragment samples so
+ * the next track starts after them.
+ *
+ * Returns the value reported by write_box_size() for the finished box. */
+static size_t mp4_write_trun(struct mp4_mux *mux, struct mp4_track *track,
+			     uint32_t moof_size, uint64_t *samples_mdat_offset)
+{
+	struct serializer *s = mux->serializer;
+	int64_t start = serializer_get_pos(s);
+
+	uint32_t flags = DATA_OFFSET_PRESENT;
+
+	/* Per-sample sizes are only listed for variable-size tracks. */
+	if (!track->sample_size)
+		flags |= SAMPLE_SIZE_PRESENT;
+
+	if (track->type == TRACK_VIDEO) {
+		flags |= FIRST_SAMPLE_FLAGS_PRESENT;
+		flags |= SAMPLE_COMPOSITION_TIME_OFFSETS_PRESENT;
+	}
+
+	/* Box version 1 is selected when negative CTS offsets are enabled. */
+	uint8_t version = mux->flags & MP4_USE_NEGATIVE_CTS ? 1 : 0;
+
+	write_fullbox(s, 0, "trun", version, flags);
+
+	/* moof_size + 8 bytes for mdat header + offset into mdat box data */
+	size_t data_offset = moof_size + 8 + *samples_mdat_offset;
+	size_t sample_count = track->fragment_samples.num;
+
+	if (track->sample_size) {
+		/* Update count based on fixed size */
+		size_t total_size = 0;
+		for (size_t i = 0; i < sample_count; i++)
+			total_size += track->fragment_samples.array[i].size;
+
+		*samples_mdat_offset += total_size;
+		sample_count = total_size / track->sample_size;
+	}
+
+	s_wb32(s, (uint32_t)sample_count); // sample_count
+	s_wb32(s, (uint32_t)data_offset);  // data_offset
+
+	/* If we have a fixed sample size (PCM audio) we only need to write
+	 * the sample count and offset. */
+	if (track->sample_size)
+		return write_box_size(s, start);
+
+	/* The first video sample gets its own flags; all other samples use
+	 * whatever default applies to the track fragment. */
+	if (track->type == TRACK_VIDEO)
+		s_wb32(s, SAMPLE_FLAG_DEPENDS_NO); // first_sample_flags
+
+	/* One entry per sample: size, plus composition offset for video. */
+	for (size_t idx = 0; idx < sample_count; idx++) {
+		struct fragment_sample *smp =
+			&track->fragment_samples.array[idx];
+
+		s_wb32(s, smp->size); // sample_size
+
+		if (track->type == TRACK_VIDEO) {
+			// sample_composition_time_offset
+			/* Scaled from the encoder timebase to the track
+			 * timescale using signed math, as the offset may be
+			 * negative. */
+			int64_t offset = (int64_t)smp->offset *
+					 (int64_t)track->timescale /
+					 (int64_t)track->timebase_den;
+			s_wb32(s, (uint32_t)offset);
+		}
+
+		/* Advance the shared mdat offset past this sample. */
+		*samples_mdat_offset += smp->size;
+	}
+
+	return write_box_size(s, start);
+}
+
+/// 8.8.6 Track Fragment Box
+/* Emits the "traf" container for one track with its children in the order
+ * tfhd, tfdt, trun. The offset/size arguments are forwarded unchanged. */
+static size_t mp4_write_traf(struct mp4_mux *mux, struct mp4_track *track,
+			     int64_t moof_start, uint32_t moof_size,
+			     uint64_t *samples_mdat_offset)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "traf");
+
+	mp4_write_tfhd(mux, track, moof_start);                     // tfhd
+	mp4_write_tfdt(mux, track);                                 // tfdt
+	mp4_write_trun(mux, track, moof_size, samples_mdat_offset); // trun
+
+	return write_box_size(out, box_start);
+}
+
+/// 8.8.4 Movie Fragment Box
+/* Emits the "moof" box: mfhd followed by one traf per regular track that
+ * has fragment samples. Only mux->tracks is visited here. */
+static size_t mp4_write_moof(struct mp4_mux *mux, uint32_t moof_size,
+			     int64_t moof_start)
+{
+	struct serializer *out = mux->serializer;
+	const int64_t box_start = serializer_get_pos(out);
+
+	write_box(out, 0, "moof");
+
+	mp4_write_mfhd(mux);
+
+	/* Running offset into the mdat payload, shared across tracks */
+	uint64_t mdat_cursor = 0;
+
+	// traf boxes
+	for (size_t idx = 0; idx < mux->tracks.num; idx++) {
+		struct mp4_track *trk = &mux->tracks.array[idx];
+
+		/* Tracks without samples in this fragment get no traf */
+		if (trk->fragment_samples.num)
+			mp4_write_traf(mux, trk, moof_start, moof_size,
+				       &mdat_cursor);
+	}
+
+	return write_box_size(out, box_start);
+}
+
+/* ========================================================================== */
+/* Chapter packets                                                            */
+
+/* Fills `pkt` with a chapter text sample for the chapter track.
+ *
+ * dts_usec is the chapter position in microseconds; it is converted to the
+ * chapter track's millisecond timebase for pts/dts. The payload is a 16-bit
+ * big-endian length, the name bytes (truncated to UINT16_MAX) and
+ * CHAPTER_PKT_FOOTER. The buffer is laid out with a `long` reference count
+ * in front of the payload and pkt->data pointing just past it.
+ *
+ * The packet is zeroed first so that fields not set here never hold
+ * indeterminate values (callers pass uninitialised stack packets). */
+static void mp4_create_chapter_pkt(struct encoder_packet *pkt, int64_t dts_usec,
+				   const char *name)
+{
+	int64_t dts = dts_usec / 1000; // chapter track uses a ms timebase
+
+	memset(pkt, 0, sizeof(*pkt));
+
+	pkt->pts = dts;
+	pkt->dts = dts;
+	pkt->dts_usec = dts_usec;
+	pkt->timebase_num = 1;
+	pkt->timebase_den = 1000;
+
+	/* Serialize with data with ref count */
+	struct serializer s;
+	struct array_output_data ao;
+	array_output_serializer_init(&s, &ao);
+
+	size_t len = min(strlen(name), UINT16_MAX);
+	long refs = 1;
+
+	/* encoder_packet refs */
+	s_write(&s, &refs, sizeof(refs));
+	/* actual packet data */
+	s_wb16(&s, (uint16_t)len);
+	s_write(&s, name, len);
+	s_write(&s, &CHAPTER_PKT_FOOTER, sizeof(CHAPTER_PKT_FOOTER));
+
+	/* Ownership of the serialized buffer moves to the packet; skip the
+	 * leading reference count when exposing the data. */
+	pkt->data = (void *)(ao.bytes.array + sizeof(long));
+	pkt->size = ao.bytes.num - sizeof(long);
+}
+
+/* ========================================================================== */
+/* Encoder packet processing and fragment writer                              */
+
+/* Converts the packet's PTS to microseconds using its timebase
+ * denominator. */
+static inline int64_t packet_pts_usec(struct encoder_packet *packet)
+{
+	const int64_t usec_per_sec = 1000000;
+
+	return packet->pts * usec_per_sec / packet->timebase_den;
+}
+
+/* Returns a pointer to the idx-th encoder_packet stored in `dq`, without
+ * removing it. */
+static inline struct encoder_packet *get_pkt_at(struct deque *dq, size_t idx)
+{
+	const size_t byte_offset = idx * sizeof(struct encoder_packet);
+
+	return deque_data(dq, byte_offset);
+}
+
+/* Returns the largest duration among mux->tracks, with each track's
+ * duration scaled by 1000 / timebase_den (i.e. milliseconds). Yields 0
+ * when there are no tracks. */
+static inline uint64_t get_longest_track_duration(struct mp4_mux *mux)
+{
+	uint64_t longest = 0;
+
+	for (size_t idx = 0; idx < mux->tracks.num; idx++) {
+		struct mp4_track *trk = &mux->tracks.array[idx];
+		const uint64_t scaled = util_mul_div64(trk->duration, 1000,
+						       trk->timebase_den);
+
+		longest = scaled > longest ? scaled : longest;
+	}
+
+	return longest;
+}
+
+/* Turns the queued packets of one track into sample records.
+ *
+ * For every processed packet a temporary entry is appended to
+ * track->fragment_samples (used for the fragment's moof) and the track's
+ * running tables for the full moov (duration, sample count, deltas, sizes,
+ * sync samples, composition offsets) are updated. *mdat_size grows by the
+ * size of each processed packet.
+ *
+ * Packets are only inspected here, not removed from the queue. The last
+ * queued packet is never processed because its duration depends on the DTS
+ * of the packet that follows it. When mux->next_frag_pts is non-zero,
+ * processing stops at the first packet whose PTS reaches it. */
+static void process_packets(struct mp4_mux *mux, struct mp4_track *track,
+			    uint64_t *mdat_size)
+{
+	size_t count = track->packets.size / sizeof(struct encoder_packet);
+
+	if (!count)
+		return;
+
+	/* Only iterate up to the penultimate packet so we can determine
+	 * duration for all processed packets. */
+	for (size_t i = 0; i < count - 1; i++) {
+		struct encoder_packet *pkt = get_pkt_at(&track->packets, i);
+
+		if (mux->next_frag_pts &&
+		    packet_pts_usec(pkt) >= mux->next_frag_pts)
+			break;
+
+		struct encoder_packet *next =
+			get_pkt_at(&track->packets, i + 1);
+
+		/* Duration is just distance between current and next DTS. */
+		uint32_t duration = (uint32_t)(next->dts - pkt->dts);
+		uint32_t sample_count = 1;
+		uint32_t size = (uint32_t)pkt->size;
+		int32_t offset = (int32_t)(pkt->pts - pkt->dts);
+
+		/* When using negative CTS, subtract DTS-PTS offset. */
+		if (track->type == TRACK_VIDEO &&
+		    mux->flags & MP4_USE_NEGATIVE_CTS) {
+			/* The offset of the first recorded sample becomes the
+			 * reference that is subtracted from all samples. */
+			if (!track->offsets.num)
+				track->dts_offset = offset;
+
+			offset -= track->dts_offset;
+		}
+
+		/* Create temporary sample information for moof */
+		struct fragment_sample *smp =
+			da_push_back_new(track->fragment_samples);
+		smp->size = size;
+		smp->offset = offset;
+		smp->duration = duration;
+
+		*mdat_size += size;
+
+		/* Update global sample information for full moov */
+		track->duration += duration;
+
+		if (track->sample_size) {
+			/* Adjust duration/count for fixed sample size */
+			sample_count = size / track->sample_size;
+			duration = 1;
+		}
+
+		/* Remember the PTS of the very first sample of the track. */
+		if (!track->samples)
+			track->first_pts = pkt->pts;
+
+		track->samples += sample_count;
+
+		/* If delta (duration) matches previous, increment counter,
+		 * otherwise create a new entry. */
+		if (track->deltas.num == 0 ||
+		    track->deltas.array[track->deltas.num - 1].delta !=
+			    duration) {
+			struct sample_delta *new =
+				da_push_back_new(track->deltas);
+			new->delta = duration;
+			new->count = sample_count;
+		} else {
+			track->deltas.array[track->deltas.num - 1].count +=
+				sample_count;
+		}
+
+		/* Individual sizes are only tracked for variable-size
+		 * samples. */
+		if (!track->sample_size)
+			da_push_back(track->sample_sizes, &size);
+
+		/* Everything below only applies to video tracks. */
+		if (track->type != TRACK_VIDEO)
+			continue;
+
+		/* track->samples was already incremented, so the value
+		 * recorded for a keyframe is its 1-based sample number. */
+		if (pkt->keyframe)
+			da_push_back(track->sync_samples, &track->samples);
+
+		/* Only require ctts box if offset is non-zero */
+		if (offset && !track->needs_ctts)
+			track->needs_ctts = true;
+
+		/* If dts-pts offset matches previous, increment counter,
+		 * otherwise create a new entry. */
+		if (track->offsets.num == 0 ||
+		    track->offsets.array[track->offsets.num - 1].offset !=
+			    offset) {
+			struct sample_offset *new =
+				da_push_back_new(track->offsets);
+			new->offset = offset;
+			new->count = 1;
+		} else {
+			track->offsets.array[track->offsets.num - 1].count += 1;
+		}
+	}
+}
+
+/* Write track data to file.
+ *
+ * Pops one queued packet per entry in track->fragment_samples, writes its
+ * payload to the mux serializer and releases it. The written range is
+ * recorded as a new chunk (offset, size, sample count) in track->chunks and
+ * the fragment sample list is cleared afterwards.
+ *
+ * Nothing is written or recorded when the track has no queued packets or no
+ * samples in the current fragment; this avoids registering an empty chunk
+ * for a track that the moof writer skipped for the same reason. */
+static void write_packets(struct mp4_mux *mux, struct mp4_track *track)
+{
+	struct serializer *s = mux->serializer;
+
+	size_t count = track->packets.size / sizeof(struct encoder_packet);
+	if (!count || !track->fragment_samples.num)
+		return;
+
+	struct chunk *chk = da_push_back_new(track->chunks);
+	chk->offset = serializer_get_pos(s);
+	chk->samples = (uint32_t)track->fragment_samples.num;
+
+	for (size_t i = 0; i < track->fragment_samples.num; i++) {
+		struct encoder_packet pkt;
+		deque_pop_front(&track->packets, &pkt,
+				sizeof(struct encoder_packet));
+		s_write(s, pkt.data, pkt.size);
+		obs_encoder_packet_release(&pkt);
+	}
+
+	chk->size = (uint32_t)(serializer_get_pos(s) - chk->offset);
+
+	/* Fixup sample count for fixed-size codecs */
+	if (track->sample_size)
+		chk->samples = chk->size / track->sample_size;
+
+	da_clear(track->fragment_samples);
+}
+
+/* Writes one fragment (moof + mdat) containing the packets queued so far.
+ *
+ * On the first call the file header is written as well: ftyp, a placeholder
+ * "free" box (its position is remembered in mux->placeholder_offset) and the
+ * initial fragmented moov.
+ *
+ * While boxes are being built, mux->serializer temporarily points at an
+ * in-memory array serializer; the finished bytes are then copied to the real
+ * serializer and the pointer is restored.
+ *
+ * A mux->next_frag_pts of zero marks the final flush: only then is the
+ * chapter track (if any) processed and written. next_frag_pts is reset to
+ * zero before returning. */
+static void mp4_flush_fragment(struct mp4_mux *mux)
+{
+	struct serializer *s = mux->serializer;
+
+	// Write file header if not already done
+	if (!mux->fragments_written) {
+		mp4_write_ftyp(mux, true);
+		/* Placeholder to write mdat header during soft-remux */
+		mux->placeholder_offset = serializer_get_pos(s);
+		mp4_write_free(mux);
+	}
+
+	// Array output as temporary buffer to avoid sending seeks to disk
+	struct serializer as;
+	struct array_output_data aod;
+	array_output_serializer_init(&as, &aod);
+	mux->serializer = &as;
+
+	// Write initial incomplete moov (because fragmentation)
+	if (!mux->fragments_written) {
+		mp4_write_moov(mux, true);
+		s_write(s, aod.bytes.array, aod.bytes.num);
+		array_output_serializer_reset(&aod);
+	}
+
+	/* Incremented before the moof is written, so the sequence number of
+	 * the first fragment is 1. */
+	mux->fragments_written++;
+
+	/* --------------------------------------------------------- */
+	/* Analyse packets and create fragment moof.                 */
+
+	/* Starts at 8 to account for the mdat box header itself. */
+	uint64_t mdat_size = 8;
+
+	for (size_t idx = 0; idx < mux->tracks.num; idx++) {
+		struct mp4_track *track = &mux->tracks.array[idx];
+		process_packets(mux, track, &mdat_size);
+	}
+
+	if (!mux->next_frag_pts && mux->chapter_track) {
+		// Create dummy chapter marker at the end so duration is correct
+		uint64_t duration = get_longest_track_duration(mux);
+		struct encoder_packet pkt;
+		mp4_create_chapter_pkt(&pkt, (int64_t)duration * 1000, "Dummy");
+		deque_push_back(&mux->chapter_track->packets, &pkt,
+				sizeof(struct encoder_packet));
+
+		process_packets(mux, mux->chapter_track, &mdat_size);
+	}
+
+	// write moof once to get size
+	/* The position is taken from the real serializer, since that is
+	 * where the moof will end up in the file. */
+	int64_t moof_start = serializer_get_pos(s);
+	size_t moof_size = mp4_write_moof(mux, 0, moof_start);
+	array_output_serializer_reset(&aod);
+
+	// write moof again with known size
+	mp4_write_moof(mux, (uint32_t)moof_size, moof_start);
+
+	// Write to output and restore real serializer
+	s_write(s, aod.bytes.array, aod.bytes.num);
+	mux->serializer = s;
+	array_output_serializer_free(&aod);
+
+	/* --------------------------------------------------------- */
+	/* Write audio and video samples (in chunks). Also update    */
+	/* global chunk and sample information for final moov.       */
+
+	/* NOTE(review): the 64-bit branch emits a 16-byte mdat header, while
+	 * the data offsets written by mp4_write_trun assume an 8-byte
+	 * header; confirm whether fragments above 4 GiB need handling. */
+	if (mdat_size > UINT32_MAX) {
+		s_wb32(s, 1);
+		s_write(s, "mdat", 4);
+		s_wb64(s, mdat_size + 8);
+	} else {
+		s_wb32(s, (uint32_t)mdat_size);
+		s_write(s, "mdat", 4);
+	}
+
+	/* Payload order matches the track order used for the moof. */
+	for (size_t i = 0; i < mux->tracks.num; i++) {
+		struct mp4_track *track = &mux->tracks.array[i];
+		write_packets(mux, track);
+	}
+
+	/* Only write chapter packets on final flush. */
+	if (!mux->next_frag_pts && mux->chapter_track)
+		write_packets(mux, mux->chapter_track);
+
+	mux->next_frag_pts = 0;
+}
+
+/* ========================================================================== */
+/* Track object functions                                                     */
+
+/* Appends a copy of `pkt` to the track's packet queue and raises
+ * track->last_pts_usec if this packet's PTS is later than any seen so
+ * far. */
+static inline void track_insert_packet(struct mp4_track *track,
+				       struct encoder_packet *pkt)
+{
+	const int64_t incoming_usec = packet_pts_usec(pkt);
+
+	if (track->last_pts_usec < incoming_usec)
+		track->last_pts_usec = incoming_usec;
+
+	deque_push_back(&track->packets, pkt, sizeof(*pkt));
+}
+
+/* Returns the fixed size in bytes of one sample (all channels) for PCM
+ * codecs, or 0 when the codec has no fixed sample size or the encoder has
+ * no audio output attached. */
+static inline uint32_t get_sample_size(struct mp4_track *track)
+{
+	audio_t *audio = obs_encoder_audio(track->encoder);
+	if (!audio)
+		return 0;
+
+	const struct audio_output_info *info = audio_output_get_info(audio);
+	const uint32_t channels = get_audio_channels(info->speakers);
+
+	/* Bytes per single-channel sample; 0 = not a fixed-size codec */
+	uint32_t bytes_per_sample = 0;
+
+	switch (track->codec) {
+	case CODEC_PCM_F32:
+		bytes_per_sample = 4; // 32-bit
+		break;
+	case CODEC_PCM_I24:
+		bytes_per_sample = 3; // 24-bit
+		break;
+	case CODEC_PCM_I16:
+		bytes_per_sample = 2; // 16-bit
+		break;
+	default:
+		break;
+	}
+
+	return channels * bytes_per_sample;
+}
+
+/* Maps the encoder's codec string to the muxer's codec enum. Strings not
+ * listed in the table yield CODEC_UNKNOWN. */
+static inline enum mp4_codec get_codec(obs_encoder_t *enc)
+{
+	static const struct {
+		const char *name;
+		enum mp4_codec id;
+	} known_codecs[] = {
+		{"h264", CODEC_H264},          {"hevc", CODEC_HEVC},
+		{"av1", CODEC_AV1},            {"aac", CODEC_AAC},
+		{"opus", CODEC_OPUS},          {"flac", CODEC_FLAC},
+		{"alac", CODEC_ALAC},          {"pcm_s16le", CODEC_PCM_I16},
+		{"pcm_s24le", CODEC_PCM_I24},  {"pcm_f32le", CODEC_PCM_F32},
+	};
+	const size_t num_known = sizeof(known_codecs) / sizeof(known_codecs[0]);
+
+	const char *codec = obs_encoder_get_codec(enc);
+
+	for (size_t idx = 0; idx < num_known; idx++) {
+		if (strcmp(codec, known_codecs[idx].name) == 0)
+			return known_codecs[idx].id;
+	}
+
+	return CODEC_UNKNOWN;
+}
+
+/* Appends a new track for `enc` to mux->tracks. The track takes its own
+ * reference to the encoder, receives the next track ID, and has its
+ * timebase/timescale (and, for audio, fixed sample size) initialised. */
+static inline void add_track(struct mp4_mux *mux, obs_encoder_t *enc)
+{
+	struct mp4_track *trk = da_push_back_new(mux->tracks);
+	const bool is_video = obs_encoder_get_type(enc) == OBS_ENCODER_VIDEO;
+
+	trk->type = is_video ? TRACK_VIDEO : TRACK_AUDIO;
+	trk->encoder = obs_encoder_get_ref(enc);
+	trk->codec = get_codec(enc);
+	trk->track_id = ++mux->track_ctr;
+
+	/* Set timebase/timescale */
+	if (trk->type != TRACK_VIDEO) {
+		uint32_t rate = obs_encoder_get_sample_rate(enc);
+
+		/* Opus is always 48 kHz */
+		if (trk->codec == CODEC_OPUS)
+			rate = 48000;
+
+		trk->timebase_num = 1;
+		trk->timebase_den = rate;
+		trk->timescale = rate;
+	} else {
+		video_t *video = obs_encoder_video(enc);
+		const struct video_output_info *voi =
+			video_output_get_info(video);
+
+		/* Timebase is the inverse of the frame rate */
+		trk->timebase_num = voi->fps_den;
+		trk->timebase_den = voi->fps_num;
+
+		/* FFmpeg does this to compensate for non-monotonic timestamps,
+		 * we probably don't need it, but let's stick to what they do
+		 * for maximum compatibility. */
+		trk->timescale = trk->timebase_den;
+		while (trk->timescale < 10000)
+			trk->timescale *= 2;
+	}
+
+	/* Set sample size (if fixed) */
+	if (trk->type == TRACK_AUDIO)
+		trk->sample_size = get_sample_size(trk);
+}
+
+/* Allocates the zero-initialised chapter track, configures it as a text
+ * track with a millisecond timebase, assigns the next track ID and stores
+ * it in mux->chapter_track. */
+static inline void add_chapter_track(struct mp4_mux *mux)
+{
+	struct mp4_track *chapters = bzalloc(sizeof(struct mp4_track));
+
+	chapters->type = TRACK_CHAPTERS;
+	chapters->codec = CODEC_TEXT;
+
+	/* 1/1000 s units for both timebase and timescale */
+	chapters->timebase_num = 1;
+	chapters->timebase_den = 1000;
+	chapters->timescale = 1000;
+
+	chapters->track_id = ++mux->track_ctr;
+
+	mux->chapter_track = chapters;
+}
+
+/* Pops and releases every encoder_packet still stored in `dq`. */
+static inline void free_packets(struct deque *dq)
+{
+	size_t remaining = dq->size / sizeof(struct encoder_packet);
+
+	while (remaining--) {
+		struct encoder_packet stale;
+
+		deque_pop_front(dq, &stale, sizeof(stale));
+		obs_encoder_packet_release(&stale);
+	}
+}
+
+/* Releases everything a track owns: its encoder reference, any packets
+ * still queued, and all per-track sample tables. The track structure
+ * itself is not freed. A NULL track is ignored. */
+static inline void free_track(struct mp4_track *track)
+{
+	if (track) {
+		obs_encoder_release(track->encoder);
+
+		/* Drop leftover packets before freeing the queue storage */
+		free_packets(&track->packets);
+		deque_free(&track->packets);
+
+		/* Sample tables */
+		da_free(track->sample_sizes);
+		da_free(track->chunks);
+		da_free(track->deltas);
+		da_free(track->offsets);
+		da_free(track->sync_samples);
+		da_free(track->fragment_samples);
+	}
+}
+
+/* ===========================================================================*/
+/* API */
+
+/* Allocates a muxer bound to `output` and `serializer`, records the
+ * creation time, and adds one track for every video encoder followed by
+ * one for every audio encoder attached to the output. */
+struct mp4_mux *mp4_mux_create(obs_output_t *output,
+			       struct serializer *serializer,
+			       enum mp4_mux_flags flags)
+{
+	struct mp4_mux *mux = bzalloc(sizeof(*mux));
+
+	mux->output = output;
+	mux->serializer = serializer;
+	mux->flags = flags;
+	/* Timestamp is based on 1904 rather than 1970. */
+	mux->creation_time = time(NULL) + 0x7C25B080;
+
+	/* Video tracks first ... */
+	for (size_t idx = 0; idx < MAX_OUTPUT_VIDEO_ENCODERS; idx++) {
+		obs_encoder_t *venc =
+			obs_output_get_video_encoder2(output, idx);
+		if (venc)
+			add_track(mux, venc);
+	}
+
+	/* ... then audio tracks. */
+	for (size_t idx = 0; idx < MAX_OUTPUT_AUDIO_ENCODERS; idx++) {
+		obs_encoder_t *aenc = obs_output_get_audio_encoder(output, idx);
+		if (aenc)
+			add_track(mux, aenc);
+	}
+
+	return mux;
+}
+
+/* Frees the muxer together with all regular tracks and the chapter track
+ * (if one was created). Passing NULL is a no-op, matching the NULL
+ * tolerance of free_track() and bfree() used below. */
+void mp4_mux_destroy(struct mp4_mux *mux)
+{
+	if (!mux)
+		return;
+
+	for (size_t i = 0; i < mux->tracks.num; i++)
+		free_track(&mux->tracks.array[i]);
+
+	/* The chapter track is allocated separately from the track array,
+	 * so its storage has to be freed in addition to its contents. */
+	free_track(mux->chapter_track);
+	bfree(mux->chapter_track);
+	da_free(mux->tracks);
+	bfree(mux);
+}
+
+/* Queues one encoder packet on the track that belongs to pkt->encoder.
+ *
+ * Before queueing, the current fragment is flushed if a fragmentation PTS
+ * is pending and every track has received data up to it. Audio packets are
+ * referenced as-is; H.264/HEVC/AV1 video packets go through their codec
+ * specific parser. Video keyframes with a PTS above zero set the next
+ * fragmentation point.
+ *
+ * Returns false (and logs a warning) when no track matches the packet's
+ * encoder, true otherwise. */
+bool mp4_mux_submit_packet(struct mp4_mux *mux, struct encoder_packet *pkt)
+{
+	struct mp4_track *track = NULL;
+	struct encoder_packet parsed_packet;
+	enum obs_encoder_type type = pkt->type;
+	bool fragment_ready = mux->next_frag_pts > 0;
+
+	for (size_t i = 0; i < mux->tracks.num; i++) {
+		struct mp4_track *tmp = &mux->tracks.array[i];
+
+		fragment_ready = fragment_ready &&
+				 tmp->last_pts_usec >= mux->next_frag_pts;
+
+		if (tmp->encoder == pkt->encoder)
+			track = tmp;
+	}
+
+	if (!track) {
+		warn("Could not find track for packet of type %s with "
+		     "track id %zu!",
+		     type == OBS_ENCODER_VIDEO ? "video" : "audio",
+		     pkt->track_idx);
+		return false;
+	}
+
+	/* If all tracks have caught up to the keyframe we want to fragment on,
+	 * flush the current fragment to disk. */
+	if (fragment_ready)
+		mp4_flush_fragment(mux);
+
+	if (type == OBS_ENCODER_AUDIO) {
+		obs_encoder_packet_ref(&parsed_packet, pkt);
+	} else {
+		switch (track->codec) {
+		case CODEC_H264:
+			obs_parse_avc_packet(&parsed_packet, pkt);
+			break;
+		case CODEC_HEVC:
+			obs_parse_hevc_packet(&parsed_packet, pkt);
+			break;
+		case CODEC_AV1:
+			obs_parse_av1_packet(&parsed_packet, pkt);
+			break;
+		default:
+			/* No parser for this codec: pass the packet through
+			 * unmodified so parsed_packet is never read while
+			 * uninitialised. */
+			obs_encoder_packet_ref(&parsed_packet, pkt);
+			break;
+		}
+
+		/* Set fragmentation PTS if packet is keyframe and PTS > 0 */
+		if (parsed_packet.keyframe && parsed_packet.pts > 0) {
+			mux->next_frag_pts = packet_pts_usec(&parsed_packet);
+		}
+	}
+
+	track_insert_packet(track, &parsed_packet);
+
+	return true;
+}
+
+/* Queues a chapter marker named `name` at `dts_usec` (microseconds). The
+ * chapter track is created on first use. Returns false for a negative
+ * timestamp, true otherwise. */
+bool mp4_mux_add_chapter(struct mp4_mux *mux, int64_t dts_usec,
+			 const char *name)
+{
+	if (dts_usec < 0)
+		return false;
+
+	if (!mux->chapter_track)
+		add_chapter_track(mux);
+
+	/* To work correctly there needs to be a chapter at PTS 0,
+	 * create that here if necessary. */
+	const bool needs_start_chapter =
+		dts_usec > 0 && mux->chapter_track->packets.size == 0;
+
+	if (needs_start_chapter) {
+		const char *start_name =
+			obs_module_text("MP4Output.StartChapter");
+		mp4_mux_add_chapter(mux, 0, start_name);
+	}
+
+	/* Create packets that will be muxed on final flush */
+	struct encoder_packet chapter_pkt;
+	mp4_create_chapter_pkt(&chapter_pkt, dts_usec, name);
+	track_insert_packet(mux->chapter_track, &chapter_pkt);
+
+	return true;
+}
+
+/* Finishes the file.
+ *
+ * All remaining packets are flushed as a final fragment. Unless
+ * MP4_SKIP_FINALISATION is set, a complete (non-fragmented) moov is then
+ * appended, the ftyp at the start of the file is rewritten, and the
+ * placeholder box is overwritten with an mdat header whose size spans from
+ * the placeholder up to the end of the last fragment.
+ *
+ * Always returns true. */
+bool mp4_mux_finalise(struct mp4_mux *mux)
+{
+	struct serializer *s = mux->serializer;
+
+	/* Flush remaining audio/video samples as final fragment. */
+	info("Flushing final fragment...");
+
+	/* Set target PTS to zero to indicate that we want to flush all
+	 * the remaining packets */
+	mux->next_frag_pts = 0;
+	mp4_flush_fragment(mux);
+
+	info("Number of fragments: %u", mux->fragments_written);
+
+	if (mux->flags & MP4_SKIP_FINALISATION) {
+		warn("Skipping MP4 finalization!");
+		return true;
+	}
+
+	/* End of the fragmented data; the full moov is appended after it. */
+	int64_t data_end = serializer_get_pos(s);
+
+	/* ---------------------------------------- */
+	/* Write full moov box                      */
+
+	/* Use array serializer for moov data as this will do a lot
+	 * of seeks to write size values of variable-size boxes. */
+	struct serializer fs;
+	struct array_output_data ao;
+	array_output_serializer_init(&fs, &ao);
+
+	mux->serializer = &fs;
+
+	mp4_write_moov(mux, false);
+	s_write(s, ao.bytes.array, ao.bytes.num);
+	info("Full moov size: %zu KiB", ao.bytes.num / 1024);
+
+	mux->serializer = s; // restore real serializer
+	array_output_serializer_free(&ao);
+
+	/* ---------------------------------------- */
+	/* Overwrite file header (ftyp + free/moov) */
+
+	serializer_seek(s, 0, SERIALIZE_SEEK_START);
+	mp4_write_ftyp(mux, false);
+
+	/* Everything between the placeholder and the end of the last
+	 * fragment becomes the payload of a single mdat box. */
+	size_t data_size = data_end - mux->placeholder_offset;
+	serializer_seek(s, (int64_t)mux->placeholder_offset,
+			SERIALIZE_SEEK_START);
+
+	/* If data is more than 4 GiB the mdat header becomes 16 bytes, hence
+	 * why we create a 16-byte placeholder "free" box at the start. */
+	if (data_size > UINT32_MAX) {
+		s_wb32(s, 1); // 1 = use "largesize" field instead
+		s_write(s, "mdat", 4);
+		s_wb64(s, data_size); // largesize (64-bit)
+	} else {
+		s_wb32(s, (uint32_t)data_size);
+		s_write(s, "mdat", 4);
+	}
+
+	info("Final mdat size: %zu KiB", data_size / 1024);
+	return true;
+}

+ 43 - 0
plugins/obs-outputs/mp4-mux.h

@@ -0,0 +1,43 @@
+/******************************************************************************
+    Copyright (C) 2024 by Dennis Sädtler <[email protected]>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+******************************************************************************/
+
+#pragma once
+
+#include <obs.h>
+#include <util/serializer.h>
+
+struct mp4_mux;
+
+/* Bit flags controlling muxer behaviour; combine with bitwise OR and pass
+ * to mp4_mux_create(). */
+enum mp4_mux_flags {
+	/* Uses mdta key/value list for metadata instead of QuickTime keys */
+	MP4_USE_MDTA_KEY_VALUE = 1 << 0,
+	/* Write encoder configuration to trak udta */
+	MP4_WRITE_ENCODER_INFO = 1 << 1,
+	/* Skip "soft-remux" and leave file in fragmented state */
+	MP4_SKIP_FINALISATION = 1 << 2,
+	/* Use negative CTS instead of edit lists */
+	MP4_USE_NEGATIVE_CTS = 1 << 3,
+};
+
+/* Creates a muxer that writes to `serializer`, with one track per video
+ * and audio encoder attached to `output`. */
+struct mp4_mux *mp4_mux_create(obs_output_t *output,
+			       struct serializer *serializer,
+			       enum mp4_mux_flags flags);
+/* Frees the muxer and all of its tracks, including any queued packets. */
+void mp4_mux_destroy(struct mp4_mux *mux);
+/* Queues an encoder packet for muxing. Returns false if no track matches
+ * the packet's encoder. */
+bool mp4_mux_submit_packet(struct mp4_mux *mux, struct encoder_packet *pkt);
+/* Adds a chapter marker at `dts_usec` (microseconds). Returns false if the
+ * timestamp is negative. */
+bool mp4_mux_add_chapter(struct mp4_mux *mux, int64_t dts_usec,
+			 const char *name);
+/* Flushes the remaining packets and, unless MP4_SKIP_FINALISATION is set,
+ * writes the complete moov and final file header. */
+bool mp4_mux_finalise(struct mp4_mux *mux);

+ 613 - 0
plugins/obs-outputs/mp4-output.c

@@ -0,0 +1,613 @@
+/******************************************************************************
+    Copyright (C) 2024 by Dennis Sädtler <[email protected]>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+******************************************************************************/
+
+#include "mp4-mux.h"
+
+#include <inttypes.h>
+
+#include <obs-module.h>
+#include <util/platform.h>
+#include <util/dstr.h>
+#include <util/threading.h>
+#include <util/buffered-file-serializer.h>
+
+#include <opts-parser.h>
+
+/* Logging helpers. do_log() prefixes every message with the output's name.
+ * It expands to an expression that dereferences a variable named `out`, so
+ * these macros only work inside functions that have such a variable (a
+ * struct mp4_output pointer) in scope. */
+#define do_log(level, format, ...)                \
+	blog(level, "[mp4 output: '%s'] " format, \
+	     obs_output_get_name(out->output), ##__VA_ARGS__)
+
+/* Convenience wrappers for the two log levels used in this file. */
+#define warn(format, ...) do_log(LOG_WARNING, format, ##__VA_ARGS__)
+#define info(format, ...) do_log(LOG_INFO, format, ##__VA_ARGS__)
+
+/* A chapter marker queued by the add_chapter procedure; queued markers are
+ * handed to the muxer right before a file is finalised. */
+struct chapter {
+	/* Marker position, copied from mp4_output.last_dts_usec when queued */
+	int64_t dts_usec;
+	/* Chapter title; heap-allocated string that is released with bfree() */
+	char *name;
+};
+
+/* Per-instance state of the MP4 file output. */
+struct mp4_output {
+	/* libobs output this instance belongs to (set once in create) */
+	obs_output_t *output;
+	/* Path of the file currently being written */
+	struct dstr path;
+
+	/* Buffered file serializer that is passed to the muxer */
+	struct serializer serializer;
+
+	/* Lifecycle flags; read/written through the os_atomic_* helpers */
+	volatile bool active;
+	volatile bool stopping;
+	/* Requested stop timestamp: the value given to stop() divided by
+	 * 1000, compared against packet->sys_dts_usec */
+	uint64_t stop_ts;
+
+	/* When false, a split picks a file name that does not exist yet */
+	bool allow_overwrite;
+	/* Sum of the sizes of all packets submitted to the muxer */
+	uint64_t total_bytes;
+
+	/* Held while a packet is processed and while a chapter is queued */
+	pthread_mutex_t mutex;
+
+	/* Muxer writing the current file, and the option bits it was
+	 * created with (parsed from the "muxer_settings" string) */
+	struct mp4_mux *muxer;
+	int flags;
+
+	/* dts_usec of the latest video packet on track 0, relative to
+	 * start_time; used as the position of newly added chapters */
+	int64_t last_dts_usec;
+	/* Chapters queued for the current file */
+	DARRAY(struct chapter) chapters;
+
+	/* File splitting state */
+	bool split_file_enabled;
+	/* Set once the next file is open and buffered packets can be flushed */
+	bool split_file_ready;
+	/* Set by the split_file procedure to request a split */
+	volatile bool manual_split;
+
+	/* Bytes submitted to the current file (only counted when splitting
+	 * is enabled) and the size limit derived from "max_size_mb" */
+	size_t cur_size;
+	size_t max_size;
+
+	/* dts_usec of the packet that opened the current file after a split,
+	 * and the duration limit derived from "max_time_sec" (microseconds) */
+	int64_t start_time;
+	int64_t max_time;
+
+	/* Per-track timestamp offsets subtracted from packets when splitting
+	 * is enabled; found_* records whether the offset has been captured */
+	bool found_video[MAX_OUTPUT_VIDEO_ENCODERS];
+	bool found_audio[MAX_OUTPUT_AUDIO_ENCODERS];
+	int64_t video_pts_offsets[MAX_OUTPUT_VIDEO_ENCODERS];
+	int64_t audio_dts_offsets[MAX_OUTPUT_AUDIO_ENCODERS];
+
+	/* Buffer for packets while we reinitialise the muxer after splitting */
+	DARRAY(struct encoder_packet) split_buffer;
+};
+
+/* Atomically reads the flag that records a pending stop request. */
+static inline bool stopping(struct mp4_output *out)
+{
+	const bool stop_requested = os_atomic_load_bool(&out->stopping);
+	return stop_requested;
+}
+
+/* Atomically reads the flag that marks the output as capturing. */
+static inline bool active(struct mp4_output *out)
+{
+	const bool is_active = os_atomic_load_bool(&out->active);
+	return is_active;
+}
+
+/* Converts a packet's PTS to microseconds by scaling it with 1000000 and
+ * dividing by the packet's timebase denominator. */
+static inline int64_t packet_pts_usec(struct encoder_packet *packet)
+{
+	const int64_t scaled_pts = packet->pts * 1000000;
+
+	return scaled_pts / packet->timebase_den;
+}
+
+/* Forgets all captured per-track timestamp offsets: every "found" marker is
+ * reset to false and every offset to zero. */
+static inline void ts_offset_clear(struct mp4_output *out)
+{
+	/* false and 0 are both all-zero bytes, so the arrays can be wiped */
+	memset(out->found_video, 0, sizeof(out->found_video));
+	memset(out->video_pts_offsets, 0, sizeof(out->video_pts_offsets));
+
+	memset(out->found_audio, 0, sizeof(out->found_audio));
+	memset(out->audio_dts_offsets, 0, sizeof(out->audio_dts_offsets));
+}
+
+/* Captures the DTS of the first packet seen on a track as that track's
+ * timestamp offset. Later packets on the same track leave it untouched. */
+static inline void ts_offset_update(struct mp4_output *out,
+				    struct encoder_packet *packet)
+{
+	const bool is_video = packet->type == OBS_ENCODER_VIDEO;
+	const size_t track = packet->track_idx;
+
+	bool *seen = is_video ? &out->found_video[track]
+			      : &out->found_audio[track];
+	int64_t *base = is_video ? &out->video_pts_offsets[track]
+				 : &out->audio_dts_offsets[track];
+
+	if (!*seen) {
+		*base = packet->dts;
+		*seen = true;
+	}
+}
+
+/* get_name callback: returns the localised display name of this output. */
+static const char *mp4_output_name(void *unused)
+{
+	const char *display_name = obs_module_text("MP4Output");
+
+	UNUSED_PARAMETER(unused);
+	return display_name;
+}
+
+/* destroy callback: releases everything owned by the output instance.
+ *
+ * `data` is the struct mp4_output allocated by the create callback. */
+static void mp4_output_destory(void *data)
+{
+	struct mp4_output *out = data;
+
+	/* Chapter names are heap strings owned by the chapter list */
+	for (size_t i = 0; i < out->chapters.num; i++)
+		bfree(out->chapters.array[i].name);
+	da_free(out->chapters);
+
+	/* Packets parked for a pending file split each hold a reference taken
+	 * with obs_encoder_packet_ref(); drop them so neither the packets nor
+	 * the array itself leak if the split never completed. */
+	for (size_t i = 0; i < out->split_buffer.num; i++)
+		obs_encoder_packet_release(&out->split_buffer.array[i]);
+	da_free(out->split_buffer);
+
+	pthread_mutex_destroy(&out->mutex);
+	dstr_free(&out->path);
+	bfree(out);
+}
+
+/* "add_chapter" procedure: queues a chapter marker at the position of the
+ * most recently processed primary video packet.
+ *
+ * data: the struct mp4_output registered with the procedure handler.
+ * cd:   call data; the optional string "chapter_name" is the chapter title.
+ *       When it is missing or empty a numbered default title is generated.
+ */
+static void mp4_add_chapter_proc(void *data, calldata_t *cd)
+{
+	struct mp4_output *out = data;
+	struct dstr name = {0};
+
+	dstr_copy(&name, calldata_string(cd, "chapter_name"));
+
+	/* The chapter list and last_dts_usec are modified by the packet
+	 * callback under this mutex, so read them only while holding it. This
+	 * also guarantees the logged time matches the stored one. */
+	pthread_mutex_lock(&out->mutex);
+
+	if (name.len == 0) {
+		/* Generate name if none provided. */
+		dstr_catf(&name, "%s %zu",
+			  obs_module_text("MP4Output.UnnamedChapter"),
+			  out->chapters.num + 1);
+	}
+
+	const int64_t chapter_usec = out->last_dts_usec;
+	const int64_t total_seconds = chapter_usec / 1000 / 1000;
+	const int seconds = (int)(total_seconds % 60);
+	const int minutes = (int)(total_seconds / 60 % 60);
+	const int hours = (int)(total_seconds / 3600);
+
+	/* Log before handing the string over: once the chapter is queued and
+	 * the mutex released, the packet thread may free the name. */
+	info("Adding chapter \"%s\" at %02d:%02d:%02d", name.array, hours,
+	     minutes, seconds);
+
+	struct chapter *chap = da_push_back_new(out->chapters);
+	chap->dts_usec = chapter_usec;
+	chap->name = name.array; /* ownership moves to the chapter list */
+
+	pthread_mutex_unlock(&out->mutex);
+}
+
+/* "split_file" procedure: reports through the "split_file_enabled" out
+ * parameter whether splitting is enabled and, if it is, raises the manual
+ * split request flag. */
+static void split_file_proc(void *data, calldata_t *cd)
+{
+	struct mp4_output *out = data;
+	const bool can_split = out->split_file_enabled;
+
+	calldata_set_bool(cd, "split_file_enabled", can_split);
+
+	if (can_split)
+		os_atomic_set_bool(&out->manual_split, true);
+}
+
+/* create callback: allocates the zero-initialised instance state and
+ * registers the "file_changed" signal plus the "split_file" and
+ * "add_chapter" procedures on the output. The settings are not used here. */
+static void *mp4_output_create(obs_data_t *settings, obs_output_t *output)
+{
+	UNUSED_PARAMETER(settings);
+
+	struct mp4_output *instance = bzalloc(sizeof(*instance));
+	instance->output = output;
+	pthread_mutex_init(&instance->mutex, NULL);
+
+	signal_handler_t *signals = obs_output_get_signal_handler(output);
+	signal_handler_add(signals, "void file_changed(string next_file)");
+
+	proc_handler_t *procs = obs_output_get_proc_handler(output);
+	proc_handler_add(procs, "void split_file(out bool split_file_enabled)",
+			 split_file_proc, instance);
+	proc_handler_add(procs, "void add_chapter(string chapter_name)",
+			 mp4_add_chapter_proc, instance);
+
+	return instance;
+}
+
+/* Sets the bits of `flag_value` in `*flags` when `value` parses (via atoi)
+ * to a non-zero integer, and clears them otherwise. */
+static inline void apply_flag(int *flags, const char *value, int flag_value)
+{
+	const bool enable = atoi(value) != 0;
+
+	*flags = enable ? (*flags | flag_value) : (*flags & ~flag_value);
+}
+
+/* Parses the custom muxer option string into a set of mp4_mux_flags bits.
+ *
+ * Starts from MP4_USE_NEGATIVE_CTS and lets each recognised option switch
+ * its flag on or off. Unrecognised options are logged and ignored. */
+static int parse_custom_options(const char *opts_str)
+{
+	static const struct {
+		const char *name;
+		int flag;
+	} known_options[] = {
+		{"skip_soft_remux", MP4_SKIP_FINALISATION},
+		{"write_encoder_info", MP4_WRITE_ENCODER_INFO},
+		{"use_metadata_tags", MP4_USE_MDTA_KEY_VALUE},
+		{"use_negative_cts", MP4_USE_NEGATIVE_CTS},
+	};
+	const size_t known_count =
+		sizeof(known_options) / sizeof(known_options[0]);
+
+	int result = MP4_USE_NEGATIVE_CTS;
+	struct obs_options parsed = obs_parse_options(opts_str);
+
+	for (size_t i = 0; i < parsed.count; i++) {
+		const struct obs_option *opt = &parsed.options[i];
+		bool matched = false;
+
+		for (size_t k = 0; k < known_count && !matched; k++) {
+			if (strcmp(opt->name, known_options[k].name) != 0)
+				continue;
+
+			apply_flag(&result, opt->value, known_options[k].flag);
+			matched = true;
+		}
+
+		if (!matched) {
+			blog(LOG_WARNING, "Unknown muxer option: %s = %s",
+			     opt->name, opt->value);
+		}
+	}
+
+	obs_free_options(parsed);
+
+	return result;
+}
+
+/* start callback: reads the output settings, opens the target file, creates
+ * the muxer and begins data capture.
+ *
+ * Returns false when capture cannot begin, the encoders fail to initialise
+ * or the file cannot be opened; true once capture has been started. */
+static bool mp4_output_start(void *data)
+{
+	struct mp4_output *out = data;
+
+	if (!obs_output_can_begin_data_capture(out->output, 0))
+		return false;
+	if (!obs_output_initialize_encoders(out->output, 0))
+		return false;
+
+	os_atomic_set_bool(&out->stopping, false);
+
+	/* The same instance can be started more than once, so make sure no
+	 * bookkeeping from a previous recording (byte counter, split start
+	 * time, chapter position, timestamp offsets, pending split request)
+	 * carries over into the new one. */
+	out->total_bytes = 0;
+	out->start_time = 0;
+	out->last_dts_usec = 0;
+	out->split_file_ready = false;
+	os_atomic_set_bool(&out->manual_split, false);
+	ts_offset_clear(out);
+
+	/* get path */
+	obs_data_t *settings = obs_output_get_settings(out->output);
+	const char *path = obs_data_get_string(settings, "path");
+	dstr_copy(&out->path, path);
+
+	/* Limits are configured in seconds / MiB and stored in usec / bytes */
+	out->max_time = obs_data_get_int(settings, "max_time_sec") * 1000000LL;
+	out->max_size = obs_data_get_int(settings, "max_size_mb") * 1024 * 1024;
+	out->split_file_enabled = obs_data_get_bool(settings, "split_file");
+	out->allow_overwrite = obs_data_get_bool(settings, "allow_overwrite");
+	out->cur_size = 0;
+
+	/* Allow skipping the remux step for debugging purposes. */
+	const char *muxer_settings =
+		obs_data_get_string(settings, "muxer_settings");
+	out->flags = parse_custom_options(muxer_settings);
+
+	obs_data_release(settings);
+
+	if (!buffered_file_serializer_init_defaults(&out->serializer,
+						    out->path.array)) {
+		warn("Unable to open MP4 file '%s'", out->path.array);
+		return false;
+	}
+
+	/* Initialise muxer and start capture */
+	out->muxer = mp4_mux_create(out->output, &out->serializer, out->flags);
+	os_atomic_set_bool(&out->active, true);
+	obs_output_begin_data_capture(out->output, 0);
+
+	info("Writing Hybrid MP4 file '%s'...", out->path.array);
+	return true;
+}
+
+/* Decides whether the file should be split in front of `packet`.
+ *
+ * Only keyframes of the primary video track (track 0) qualify, so a group
+ * of pictures is never cut apart. For such a packet a split is due when one
+ * was requested manually, or the size or duration limit has been reached. */
+static inline bool should_split(struct mp4_output *out,
+				struct encoder_packet *packet)
+{
+	const bool primary_video = packet->type == OBS_ENCODER_VIDEO &&
+				   packet->track_idx == 0;
+
+	if (!primary_video || !packet->keyframe)
+		return false;
+
+	if (os_atomic_load_bool(&out->manual_split))
+		return true;
+
+	/* A limit of zero means "no limit" */
+	const bool size_reached =
+		out->max_size > 0 &&
+		out->cur_size + (int64_t)packet->size >= out->max_size;
+	if (size_reached)
+		return true;
+
+	return out->max_time > 0 &&
+	       packet->dts_usec - out->start_time >= out->max_time;
+}
+
+/* Makes `path` refer to a file that does not exist yet.
+ *
+ * If the path is free, or has no '.' to insert a counter in front of, it is
+ * left alone. Otherwise a counter starting at 2 is inserted before the last
+ * '.' (" (N)" when `space` is set, "_N" otherwise) and incremented until an
+ * unused name is found. */
+static void find_best_filename(struct dstr *path, bool space)
+{
+	if (!os_file_exists(path->array))
+		return;
+
+	const char *dot = strrchr(path->array, '.');
+	if (!dot)
+		return;
+
+	const size_t stem_len = dot - path->array;
+	int counter = 2;
+
+	struct dstr candidate;
+	dstr_init_copy_dstr(&candidate, path);
+
+	do {
+		/* Cut back to the stem, then append counter and extension */
+		dstr_resize(&candidate, stem_len);
+		dstr_catf(&candidate, space ? " (%d)" : "_%d", counter++);
+		dstr_cat(&candidate, dot);
+	} while (os_file_exists(candidate.array));
+
+	dstr_free(path);
+	dstr_init_move(path, &candidate);
+}
+
+/* Builds the path of the next output file in `dst` from the output's
+ * "directory", "format", "extension" and "allow_spaces" settings, creating
+ * the target directory on the way. Unless `overwrite` is set, the name is
+ * adjusted so it does not collide with an existing file. */
+static void generate_filename(struct mp4_output *out, struct dstr *dst,
+			      bool overwrite)
+{
+	obs_data_t *cfg = obs_output_get_settings(out->output);
+	const char *directory = obs_data_get_string(cfg, "directory");
+	const char *name_format = obs_data_get_string(cfg, "format");
+	const char *extension = obs_data_get_string(cfg, "extension");
+	const bool allow_spaces = obs_data_get_bool(cfg, "allow_spaces");
+
+	char *file_name = os_generate_formatted_filename(
+		extension, allow_spaces, name_format);
+
+	/* Normalise to forward slashes and join directory and file name */
+	dstr_copy(dst, directory);
+	dstr_replace(dst, "\\", "/");
+	if (dstr_end(dst) != '/')
+		dstr_cat_ch(dst, '/');
+	dstr_cat(dst, file_name);
+	bfree(file_name);
+
+	/* Temporarily terminate the string at the last separator so only the
+	 * directory part is passed to os_mkdirs() */
+	char *last_sep = strrchr(dst->array, '/');
+	if (last_sep) {
+		*last_sep = 0;
+		os_mkdirs(dst->array);
+		*last_sep = '/';
+	}
+
+	if (!overwrite)
+		find_best_filename(dst, allow_spaces);
+
+	obs_data_release(cfg);
+}
+
+/* Finishes the current file and continues the recording in a new one.
+ *
+ * out: the output instance.
+ * pkt: first packet that will go into the new file; its dts_usec becomes
+ *      the new file's start time.
+ *
+ * Returns false if the new file could not be opened. In that case the old
+ * file has already been finalised and closed, and out->muxer is NULL. */
+static bool change_file(struct mp4_output *out, struct encoder_packet *pkt)
+{
+	uint64_t start_time = os_gettime_ns();
+
+	/* finalise file */
+	for (size_t i = 0; i < out->chapters.num; i++) {
+		struct chapter *chap = &out->chapters.array[i];
+		mp4_mux_add_chapter(out->muxer, chap->dts_usec, chap->name);
+	}
+
+	mp4_mux_finalise(out->muxer);
+
+	info("Waiting for file writer to finish...");
+
+	/* flush/close file and destroy old muxer */
+	buffered_file_serializer_free(&out->serializer);
+	mp4_mux_destroy(out->muxer);
+	/* Never leave a dangling pointer behind: if opening the next file
+	 * fails below, nothing must be able to reach the destroyed muxer. */
+	out->muxer = NULL;
+
+	for (size_t i = 0; i < out->chapters.num; i++)
+		bfree(out->chapters.array[i].name);
+
+	da_clear(out->chapters);
+
+	info("MP4 file split complete. Finalization took %" PRIu64 " ms.",
+	     (os_gettime_ns() - start_time) / 1000000);
+
+	/* open new file */
+	generate_filename(out, &out->path, out->allow_overwrite);
+	info("Changing output file to '%s'", out->path.array);
+
+	if (!buffered_file_serializer_init_defaults(&out->serializer,
+						    out->path.array)) {
+		warn("Unable to open MP4 file '%s'", out->path.array);
+		return false;
+	}
+
+	out->muxer = mp4_mux_create(out->output, &out->serializer, out->flags);
+
+	/* Tell listeners which file the recording continues in */
+	calldata_t cd = {0};
+	signal_handler_t *sh = obs_output_get_signal_handler(out->output);
+	calldata_set_string(&cd, "next_file", out->path.array);
+	signal_handler_signal(sh, "file_changed", &cd);
+	calldata_free(&cd);
+
+	/* Restart the per-file bookkeeping */
+	out->cur_size = 0;
+	out->start_time = pkt->dts_usec;
+	ts_offset_clear(out);
+
+	return true;
+}
+
+/* stop callback: records the requested stop timestamp (the given value
+ * divided by 1000) and raises the stopping flag. The actual shutdown is
+ * performed later by the packet callback. */
+static void mp4_output_stop(void *data, uint64_t ts)
+{
+	struct mp4_output *out = data;
+	const uint64_t scaled_ts = ts / 1000;
+
+	out->stop_ts = scaled_ts;
+	os_atomic_set_bool(&out->stopping, true);
+}
+
+/* Performs the real shutdown: finalises and closes the file, destroys the
+ * muxer and notifies libobs.
+ *
+ * out:  the output instance.
+ * code: 0 for a regular stop (ends data capture), otherwise the stop code
+ *       that is signalled to libobs. */
+static void mp4_output_actual_stop(struct mp4_output *out, int code)
+{
+	os_atomic_set_bool(&out->active, false);
+
+	uint64_t start_time = os_gettime_ns();
+
+	/* Only touch the muxer if there is one; a NULL pointer means there is
+	 * no open file left to finalise. */
+	if (out->muxer) {
+		for (size_t i = 0; i < out->chapters.num; i++) {
+			struct chapter *chap = &out->chapters.array[i];
+			mp4_mux_add_chapter(out->muxer, chap->dts_usec,
+					    chap->name);
+		}
+
+		mp4_mux_finalise(out->muxer);
+	}
+
+	if (code) {
+		obs_output_signal_stop(out->output, code);
+	} else {
+		obs_output_end_data_capture(out->output);
+	}
+
+	info("Waiting for file writer to finish...");
+
+	/* Flush/close output file and destroy muxer */
+	if (out->muxer) {
+		buffered_file_serializer_free(&out->serializer);
+		mp4_mux_destroy(out->muxer);
+		out->muxer = NULL;
+	}
+
+	/* Clear chapter data */
+	for (size_t i = 0; i < out->chapters.num; i++)
+		bfree(out->chapters.array[i].name);
+
+	da_clear(out->chapters);
+
+	/* Drop packets that were parked for a split which never completed.
+	 * Each holds a packet reference, and leaving them queued would both
+	 * leak them and feed them into the next recording. */
+	for (size_t i = 0; i < out->split_buffer.num; i++)
+		obs_encoder_packet_release(&out->split_buffer.array[i]);
+
+	da_free(out->split_buffer);
+	out->split_file_ready = false;
+
+	info("MP4 file output complete. Finalization took %" PRIu64 " ms.",
+	     (os_gettime_ns() - start_time) / 1000000);
+}
+
+/* Takes a new reference to `packet` and appends it to the split buffer. */
+static void push_back_packet(struct mp4_output *out,
+			     struct encoder_packet *packet)
+{
+	struct encoder_packet held;
+
+	obs_encoder_packet_ref(&held, packet);
+	da_push_back(out->split_buffer, &held);
+}
+
+/* Accounts for a packet and passes it to the muxer.
+ *
+ * Without file splitting the packet is forwarded untouched. With splitting
+ * enabled, the per-file size counter is advanced and a local copy of the
+ * packet, with the track's timestamp offset subtracted from DTS and PTS, is
+ * forwarded instead. Returns the muxer's result. */
+static inline bool submit_packet(struct mp4_output *out,
+				 struct encoder_packet *pkt)
+{
+	out->total_bytes += pkt->size;
+
+	if (out->split_file_enabled) {
+		out->cur_size += pkt->size;
+
+		/* Shift timestamps on a copy; the caller's packet stays as is */
+		struct encoder_packet shifted = *pkt;
+		const size_t track = shifted.track_idx;
+		const int64_t base = shifted.type == OBS_ENCODER_VIDEO
+					     ? out->video_pts_offsets[track]
+					     : out->audio_dts_offsets[track];
+
+		shifted.dts -= base;
+		shifted.pts -= base;
+
+		return mp4_mux_submit_packet(out->muxer, &shifted);
+	}
+
+	return mp4_mux_submit_packet(out->muxer, pkt);
+}
+
+/* encoded_packet callback: handles stop requests and file splitting, then
+ * submits the packet to the muxer. The whole function runs with out->mutex
+ * held.
+ *
+ * data:   the struct mp4_output instance.
+ * packet: the encoded packet; a NULL packet stops the output with
+ *         OBS_OUTPUT_ENCODE_ERROR. */
+static void mp4_output_packet(void *data, struct encoder_packet *packet)
+{
+	struct mp4_output *out = data;
+
+	pthread_mutex_lock(&out->mutex);
+
+	/* Ignore packets that arrive while the output is not active */
+	if (!active(out))
+		goto unlock;
+
+	if (!packet) {
+		mp4_output_actual_stop(out, OBS_OUTPUT_ENCODE_ERROR);
+		goto unlock;
+	}
+
+	/* A stop was requested: keep writing until the first packet at or
+	 * past the requested stop timestamp, then shut down */
+	if (stopping(out)) {
+		if (packet->sys_dts_usec >= (int64_t)out->stop_ts) {
+			mp4_output_actual_stop(out, 0);
+			goto unlock;
+		}
+	}
+
+	if (out->split_file_enabled) {
+		if (out->split_buffer.num) {
+			/* A split is pending; the first buffered packet is the
+			 * one that will open the next file */
+			int64_t pts_usec = packet_pts_usec(packet);
+			struct encoder_packet *first_pkt =
+				out->split_buffer.array;
+			int64_t first_pts_usec = packet_pts_usec(first_pkt);
+
+			/* Packets with an earlier PTS skip this branch and are
+			 * still written to the current file below */
+			if (pts_usec >= first_pts_usec) {
+				/* Keep buffering non-audio packets; the first
+				 * audio packet at or past the split point
+				 * triggers the file change */
+				if (packet->type != OBS_ENCODER_AUDIO) {
+					push_back_packet(out, packet);
+					goto unlock;
+				}
+
+				if (!change_file(out, first_pkt)) {
+					mp4_output_actual_stop(
+						out, OBS_OUTPUT_ERROR);
+					goto unlock;
+				}
+				out->split_file_ready = true;
+			}
+		} else if (should_split(out, packet)) {
+			/* Start a split: hold this packet back for the next
+			 * file */
+			push_back_packet(out, packet);
+			goto unlock;
+		}
+	}
+
+	/* The new file is open: flush the held-back packets into it, capturing
+	 * the new per-track timestamp offsets as they go, and drop the
+	 * references taken when they were buffered */
+	if (out->split_file_ready) {
+		for (size_t i = 0; i < out->split_buffer.num; i++) {
+			struct encoder_packet *pkt =
+				&out->split_buffer.array[i];
+			ts_offset_update(out, pkt);
+			submit_packet(out, pkt);
+			obs_encoder_packet_release(pkt);
+		}
+
+		da_free(out->split_buffer);
+		out->split_file_ready = false;
+		os_atomic_set_bool(&out->manual_split, false);
+	}
+
+	if (out->split_file_enabled)
+		ts_offset_update(out, packet);
+
+	/* Track the file-relative DTS of the primary video track; it is used
+	 * as the position of newly added chapter markers */
+	if (packet->type == OBS_ENCODER_VIDEO && packet->track_idx == 0)
+		out->last_dts_usec = packet->dts_usec - out->start_time;
+
+	submit_packet(out, packet);
+
+	/* A position of -1 is treated as a fatal error of the file writer
+	 * (presumably a failed write - confirm against the serializer) */
+	if (serializer_get_pos(&out->serializer) == -1)
+		mp4_output_actual_stop(out, OBS_OUTPUT_ERROR);
+
+unlock:
+	pthread_mutex_unlock(&out->mutex);
+}
+
+/* get_properties callback: exposes two plain text properties, the output
+ * file path and the custom muxer settings string. */
+static obs_properties_t *mp4_output_properties(void *unused)
+{
+	obs_properties_t *list = obs_properties_create();
+
+	UNUSED_PARAMETER(unused);
+
+	obs_properties_add_text(list, "path",
+				obs_module_text("MP4Output.FilePath"),
+				OBS_TEXT_DEFAULT);
+	obs_properties_add_text(list, "muxer_settings", "muxer_settings",
+				OBS_TEXT_DEFAULT);
+
+	return list;
+}
+
+/* get_total_bytes callback: returns the accumulated size of all packets
+ * submitted to the muxer. */
+uint64_t mp4_output_total_bytes(void *data)
+{
+	const struct mp4_output *state = data;
+
+	return state->total_bytes;
+}
+
+/* Registration record for the "mp4_output" output type: an encoded,
+ * multi-track audio/video output that can be paused. It lists the accepted
+ * video and audio codecs and wires up the callbacks defined in this file. */
+struct obs_output_info mp4_output_info = {
+	.id = "mp4_output",
+	.flags = OBS_OUTPUT_AV | OBS_OUTPUT_ENCODED |
+		 OBS_OUTPUT_MULTI_TRACK_AV | OBS_OUTPUT_CAN_PAUSE,
+	/* Semicolon-separated codec lists */
+	.encoded_video_codecs = "h264;hevc;av1",
+	.encoded_audio_codecs = "aac",
+	.get_name = mp4_output_name,
+	.create = mp4_output_create,
+	.destroy = mp4_output_destory,
+	.start = mp4_output_start,
+	.stop = mp4_output_stop,
+	.encoded_packet = mp4_output_packet,
+	.get_properties = mp4_output_properties,
+	.get_total_bytes = mp4_output_total_bytes,
+};

+ 2 - 0
plugins/obs-outputs/obs-outputs.c

@@ -15,6 +15,7 @@ MODULE_EXPORT const char *obs_module_description(void)
 extern struct obs_output_info rtmp_output_info;
 extern struct obs_output_info null_output_info;
 extern struct obs_output_info flv_output_info;
+extern struct obs_output_info mp4_output_info;
 #if defined(FTL_FOUND)
 extern struct obs_output_info ftl_output_info;
 #endif
@@ -65,6 +66,7 @@ bool obs_module_load(void)
 	obs_register_output(&rtmp_output_info);
 	obs_register_output(&null_output_info);
 	obs_register_output(&flv_output_info);
+	obs_register_output(&mp4_output_info);
 #if defined(FTL_FOUND)
 	obs_register_output(&ftl_output_info);
 #endif