/****************************************************************************** Copyright (C) 2024 by Dennis Sädtler This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ******************************************************************************/ #include "mp4-mux-internal.h" #include "rtmp-hevc.h" #include "rtmp-av1.h" #include #include #include #include #include #include #include /* * (Mostly) compliant MP4 muxer for fun and profit. * Based on ISO/IEC 14496-12 and FFmpeg's libavformat/movenc.c ([L]GPL) * * Specification section numbers are noted where applicable. * Standard identifier is included if not referring to ISO/IEC 14496-12. */ #define do_log(level, format, ...) \ blog(level, "[%s muxer: '%s'] " format, mux->flavor == FLAVOR_MOV ? "mov" : "mp4", \ obs_output_get_name(mux->output), ##__VA_ARGS__) #define warn(format, ...) do_log(LOG_WARNING, format, ##__VA_ARGS__) #define info(format, ...) do_log(LOG_INFO, format, ##__VA_ARGS__) /* Helper to overwrite placeholder size and return total size. */ static inline size_t write_box_size(struct serializer *s, int64_t start) { int64_t end = serializer_get_pos(s); size_t size = end - start; serializer_seek(s, start, SERIALIZE_SEEK_START); s_wb32(s, (uint32_t)size); serializer_seek(s, end, SERIALIZE_SEEK_START); return size; } /// 4.2 Box header with size and char[4] name static inline void write_box(struct serializer *s, const size_t size, const char name[4]) { if (size <= UINT32_MAX) { s_wb32(s, (uint32_t)size); // size s_write(s, name, 4); // boxtype } else { s_wb32(s, 1); // size s_write(s, name, 4); // boxtype s_wb64(s, size); // largesize } } /// 4.2 FullBox extended header with u8 version and u24 flags static inline void write_fullbox(struct serializer *s, const size_t size, const char name[4], uint8_t version, uint32_t flags) { write_box(s, size, name); s_w8(s, version); s_wb24(s, flags); } /// 4.3 File Type Box static size_t mp4_write_ftyp(struct mp4_mux *mux, bool fragmented) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "ftyp"); if (mux->flavor == FLAVOR_MOV) { /* For MOV, the brand is just "qt" followed by two spaces. */ s_write(s, "qt ", 4); // major brand s_wb32(s, 0x20140200); // minor version (BCD YYYYMM00 per QTFF spec) s_write(s, "qt ", 4); // minor brand } else { const char *major_brand = "isom"; /* Following FFmpeg's example, when using negative CTS the major brand * needs to be either iso4 or iso6 depending on whether the file is * currently fragmented. */ if (mux->flags & MP4_USE_NEGATIVE_CTS) major_brand = fragmented ? "iso6" : "iso4"; s_write(s, major_brand, 4); // major brand s_wb32(s, 0); // minor version s_write(s, major_brand, 4); // minor brands (first one matches major brand) /* Write isom base brand if it's not the major brand */ if (strcmp(major_brand, "isom") != 0) s_write(s, "isom", 4); /* Avoid adding newer brand (iso6) unless necessary, use "obs1" brand * as a placeholder to maintain ftyp box size. */ if (fragmented && strcmp(major_brand, "iso6") != 0) s_write(s, "iso6", 4); else s_write(s, "obs1", 4); s_write(s, "iso2", 4); /* Include H.264 brand if used */ for (size_t i = 0; i < mux->tracks.num; i++) { struct mp4_track *track = &mux->tracks.array[i]; if (track->type == TRACK_VIDEO) { if (track->codec == CODEC_H264) s_write(s, "avc1", 4); break; } } /* General MP4 brannd */ s_write(s, "mp41", 4); } return write_box_size(s, start); } /// 8.1.2 Free Space Box static size_t mp4_write_free(struct mp4_mux *mux) { struct serializer *s = mux->serializer; /* Write a 16-byte free box, so it can be replaced with a 64-bit size * box header (u32 + char[4] + u64) */ s_wb32(s, 16); s_write(s, mux->flavor == FLAVOR_MOV ? "wide" : "free", 4); s_wb64(s, 0); return 16; } /// 8.2.2 Movie Header Box static size_t mp4_write_mvhd(struct mp4_mux *mux) { struct serializer *s = mux->serializer; size_t start = serializer_get_pos(s); /* Use primary video track as the baseline for duration */ uint64_t duration = 0; for (size_t i = 0; i < mux->tracks.num; i++) { struct mp4_track *track = &mux->tracks.array[i]; if (track->type == TRACK_VIDEO) { duration = util_mul_div64(track->duration, 1000, track->timebase_den); break; } } bool extended_ts = duration > UINT32_MAX || mux->creation_time > UINT32_MAX; uint8_t version = extended_ts ? 1 : 0; write_fullbox(s, 0, "mvhd", version, 0); if (extended_ts) { s_wb64(s, mux->creation_time); // creation time s_wb64(s, mux->creation_time); // modification time s_wb32(s, 1000); // timescale s_wb64(s, duration); // duration (0 for fragmented) } else { s_wb32(s, (uint32_t)mux->creation_time); // creation time s_wb32(s, (uint32_t)mux->creation_time); // modification time s_wb32(s, 1000); // timescale s_wb32(s, (uint32_t)duration); // duration (0 for fragmented) } s_wb32(s, 0x00010000); // rate, 16.16 fixed float (1 << 16) s_wb16(s, 0x0100); // volume s_wb16(s, 0); // reserved s_wb32(s, 0); // reserved s_wb32(s, 0); // reserved // Matrix for (int i = 0; i < 9; i++) s_wb32(s, UNITY_MATRIX[i]); // pre_defined s_wb32(s, 0); s_wb32(s, 0); s_wb32(s, 0); s_wb32(s, 0); s_wb32(s, 0); s_wb32(s, 0); s_wb32(s, mux->track_ctr + 1); // next_track_ID return write_box_size(s, start); } /// 8.3.2 Track Header Box static size_t mp4_write_tkhd(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; size_t start = serializer_get_pos(s); uint64_t duration = util_mul_div64(track->duration, 1000, track->timebase_den); bool extended_ts = duration > UINT32_MAX || mux->creation_time > UINT32_MAX; uint8_t version = extended_ts ? 1 : 0; /* Flags are 0x1 (enabled) | 0x2 (in movie) */ static const uint32_t flags = 0x1 | 0x2; write_fullbox(s, 0, "tkhd", version, flags); if (extended_ts) { s_wb64(s, mux->creation_time); // creation time s_wb64(s, mux->creation_time); // modification time s_wb32(s, track->track_id); // track_id s_wb32(s, 0); // reserved s_wb64(s, duration); // duration in movie timescale } else { s_wb32(s, (uint32_t)mux->creation_time); // creation time s_wb32(s, (uint32_t)mux->creation_time); // modification time s_wb32(s, track->track_id); // track_id s_wb32(s, 0); // reserved s_wb32(s, (uint32_t)duration); // duration in movie timescale } s_wb32(s, 0); // reserved s_wb32(s, 0); // reserved s_wb16(s, 0); // layer s_wb16(s, track->type == TRACK_AUDIO ? 1 : 0); // alternate group s_wb16(s, track->type == TRACK_AUDIO ? 0x100 : 0); // volume s_wb16(s, 0); // reserved // Matrix (predefined) for (int i = 0; i < 9; i++) s_wb32(s, UNITY_MATRIX[i]); if (track->type == TRACK_AUDIO) { s_wb32(s, 0); // width s_wb32(s, 0); // height } else { /* width/height are fixed point 16.16, so we just shift the * integer to the upper 16 bits */ uint32_t width = obs_encoder_get_width(track->encoder); s_wb32(s, width << 16); uint32_t height = obs_encoder_get_height(track->encoder); s_wb32(s, height << 16); } return write_box_size(s, start); } /// 8.4.2 Media Header Box static size_t mp4_write_mdhd(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; size_t size = 32; uint8_t version = 0; uint64_t duration = track->duration; uint32_t timescale = track->timescale; if (track->type == TRACK_VIDEO) { /* Update to track timescale */ duration = util_mul_div64(duration, track->timescale, track->timebase_den); } /* use 64-bit duration if necessary */ if (duration > UINT32_MAX || mux->creation_time > UINT32_MAX) { if (mux->flavor == FLAVOR_MOV) { /* QTFF does not specify how to handle 32-bit overflow for duration/timestamps. */ warn("Duration too large for MOV, this file may be unplayable in QuickTime!"); } size = 44; version = 1; } write_fullbox(s, size, "mdhd", version, 0); if (version == 1) { s_wb64(s, mux->creation_time); // creation time s_wb64(s, mux->creation_time); // modification time s_wb32(s, timescale); // timescale s_wb64(s, (uint32_t)duration); // duration } else { s_wb32(s, (uint32_t)mux->creation_time); // creation time s_wb32(s, (uint32_t)mux->creation_time); // modification time s_wb32(s, timescale); // timescale s_wb32(s, (uint32_t)duration); // duration } s_wb16(s, mux->flavor == FLAVOR_MOV ? 32767 : 21956); // language (undefined) s_wb16(s, 0); // pre_defined return size; } /// 8.4.3 Handler Reference Box static size_t mp4_write_hdlr(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "hdlr", 0, 0); if (mux->flavor == FLAVOR_MOV) s_write(s, track ? "mhlr" : "dhlr", 4); else s_wb32(s, 0); // pre_defined // handler_type if (!track) s_write(s, "url ", 4); else if (track->type == TRACK_VIDEO) s_write(s, "vide", 4); else if (track->type == TRACK_CHAPTERS) s_write(s, "text", 4); else s_write(s, "soun", 4); s_wb32(s, 0); // reserved s_wb32(s, 0); // reserved s_wb32(s, 0); // reserved const char *handler_name; if (!track) handler_name = "OBS Data Handler"; else if (track->type == TRACK_VIDEO) handler_name = "OBS Video Handler"; else if (track->type == TRACK_CHAPTERS) handler_name = "OBS Chapter Handler"; else handler_name = "OBS Audio Handler"; // name (null-terminated for MP4, pascal string for MOV) size_t handler_len = strlen(handler_name); if (mux->flavor == FLAVOR_MOV) { s_w8(s, (uint8_t)handler_len); s_write(s, handler_name, handler_len); } else { s_write(s, handler_name, handler_len); s_w8(s, 0); // NULL terminator } return write_box_size(s, start); } /// 12.1.2 Video media header static size_t mp4_write_vmhd(struct mp4_mux *mux) { struct serializer *s = mux->serializer; /* Flags is always 1 */ write_fullbox(s, 20, "vmhd", 0, 1); s_wb16(s, 0); // graphicsmode s_wb16(s, 0); // opcolor r s_wb16(s, 0); // opcolor g s_wb16(s, 0); // opcolor b return 16; } /// 12.2.2 Sound media header static size_t mp4_write_smhd(struct mp4_mux *mux) { struct serializer *s = mux->serializer; write_fullbox(s, 16, "smhd", 0, 0); s_wb16(s, 0); // balance s_wb16(s, 0); // reserved return 16; } /// (QTFF/Apple) Text media information atom static size_t mp4_write_qt_text(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "text"); /* Identity matrix, note that it's not fixed point 16.16 */ s_wb16(s, 0x01); s_wb32(s, 0x00); s_wb32(s, 0x00); s_wb32(s, 0x00); s_wb32(s, 0x01); s_wb32(s, 0x00); s_wb32(s, 0x00); s_wb32(s, 0x00); s_wb32(s, 0x00004000); /* Seemingly undocumented */ s_wb16(s, 0x0000); return write_box_size(s, start); } /// (QTFF/Apple) Base media info atom static size_t mp4_write_gmin(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "gmin", 0, 0); s_wb16(s, 0x40); // graphics mode s_wb16(s, 0x8000); // opColor r s_wb16(s, 0x8000); // opColor g s_wb16(s, 0x8000); // opColor b s_wb16(s, 0); // balance s_wb16(s, 0); // reserved return write_box_size(s, start); } /// (QTFF/Apple) Base media information header atom static size_t mp4_write_gmhd(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "gmhd"); // gmin mp4_write_gmin(mux); // text (QuickTime) mp4_write_qt_text(mux); return write_box_size(s, start); } /// ISO/IEC 14496-15 5.4.2.1 AVCConfigurationBox static size_t mp4_write_avcC(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; /* For AVC this is the parsed extra data. */ uint8_t *header; size_t size; struct encoder_packet packet = {.type = OBS_ENCODER_VIDEO, .timebase_den = 1, .keyframe = true}; if (!obs_encoder_get_extra_data(enc, &header, &size)) return 0; packet.size = obs_parse_avc_header(&packet.data, header, size); size_t box_size = packet.size + 8; write_box(s, box_size, "avcC"); s_write(s, packet.data, packet.size); bfree(packet.data); return box_size; } /// ISO/IEC 14496-15 8.4.1.1 HEVCConfigurationBox static size_t mp4_write_hvcC(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; /* For HEVC this is the parsed extra data. */ uint8_t *header; size_t size; struct encoder_packet packet = {.type = OBS_ENCODER_VIDEO, .timebase_den = 1, .keyframe = true}; if (!obs_encoder_get_extra_data(enc, &header, &size)) return 0; packet.size = obs_parse_hevc_header(&packet.data, header, size); size_t box_size = packet.size + 8; write_box(s, box_size, "hvcC"); s_write(s, packet.data, packet.size); bfree(packet.data); return box_size; } /// AV1 ISOBMFF 2.3. AV1 Codec Configuration Box static size_t mp4_write_av1C(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; /* For AV1 this is just the parsed extra data. */ uint8_t *header; size_t size; struct encoder_packet packet = {.type = OBS_ENCODER_VIDEO, .timebase_den = 1, .keyframe = true}; if (!obs_encoder_get_extra_data(enc, &header, &size)) return 0; packet.size = obs_parse_av1_header(&packet.data, header, size); size_t box_size = packet.size + 8; write_box(s, box_size, "av1C"); s_write(s, packet.data, packet.size); bfree(packet.data); return box_size; } /// 12.1.5 Colour information static size_t mp4_write_colr(struct mp4_mux *mux, obs_encoder_t *enc) { UNUSED_PARAMETER(enc); struct serializer *s = mux->serializer; write_box(s, 19, "colr"); uint8_t full_range = 0; uint16_t pri, trc, spc; pri = trc = spc = 0; get_colour_information(enc, &pri, &trc, &spc, &full_range); s_write(s, "nclx", 4); // colour_type s_wb16(s, pri); // colour_primaries s_wb16(s, trc); // transfer_characteristics s_wb16(s, spc); // matrix_coefficiencts s_w8(s, full_range << 7); // full range flag + 7 reserved bits (0) return 19; } /// 12.1.4 Pixel Aspect Ratio static size_t mp4_write_pasp(struct mp4_mux *mux) { struct serializer *s = mux->serializer; write_box(s, 16, "pasp"); s_wb32(s, 1); // hSpacing s_wb32(s, 1); // vSpacing return 16; } /// 12.1.3 Visual Sample Entry static inline void mp4_write_visual_sample_entry(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; // SampleEntry Box s_w8(s, 0); // reserved s_w8(s, 0); s_w8(s, 0); s_w8(s, 0); s_w8(s, 0); s_w8(s, 0); s_wb16(s, 1); // data_reference_index // VisualSampleEntry Box s_wb16(s, 0); // pre_defined s_wb16(s, 0); // reserved if (mux->flavor == FLAVOR_MOV) { s_write(s, "OBSS", 4); // vendor s_wb32(s, 0x200); // temporal quality (codecNormalQuality = 512) s_wb32(s, 0x200); // spatial quality (codecNormalQuality) } else { s_wb32(s, 0); // pre_defined s_wb32(s, 0); // pre_defined s_wb32(s, 0); // pre_defined } s_wb16(s, (uint16_t)obs_encoder_get_width(enc)); // width s_wb16(s, (uint16_t)obs_encoder_get_height(enc)); // height s_wb32(s, 0x00480000); // horizresolution (predefined) s_wb32(s, 0x00480000); // vertresolution (predefined) s_wb32(s, 0); // reserved s_wb16(s, 1); // frame_count /* Name is fixed 32-bytes and needs to be padded to that length. * First byte is the length, rest is a string sans NULL terminator. */ char compressor_name[32] = {0}; const char *enc_id = obs_encoder_get_id(enc); if (enc_id) { size_t len = strlen(enc_id); if (len > 31) len = 31; compressor_name[0] = (char)len; memcpy(compressor_name + 1, enc_id, len); } s_write(s, compressor_name, sizeof(compressor_name)); // compressorname s_wb16(s, 0x0018); // depth s_wb16(s, -1); // pre_defined } /// 12.1.6 Content light level static size_t mp4_write_clli(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; video_t *video = obs_encoder_video(enc); const struct video_output_info *info = video_output_get_info(video); /* Only write box for HDR video */ if (info->colorspace != VIDEO_CS_2100_PQ && info->colorspace != VIDEO_CS_2100_HLG) return 0; write_box(s, 12, "clli"); float nominal_peak = obs_get_video_hdr_nominal_peak_level(); s_wb16(s, (uint16_t)nominal_peak); // max_content_light_level s_wb16(s, (uint16_t)nominal_peak); // max_pic_average_light_level return 12; } /// 12.1.7 Mastering display colour volume static size_t mp4_write_mdcv(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; video_t *video = obs_encoder_video(enc); const struct video_output_info *info = video_output_get_info(video); // Only write atom for HDR video if (info->colorspace != VIDEO_CS_2100_PQ && info->colorspace != VIDEO_CS_2100_HLG) return 0; write_box(s, 32, "mdcv"); float nominal_peak = obs_get_video_hdr_nominal_peak_level(); uint32_t max_lum = (uint32_t)nominal_peak * 10000; /* Note that these values are hardcoded everywhere in OBS, so these are * just the same as used in our other muxers/encoders. */ // 3 x display_primaries (x, y) pairs s_wb16(s, 13250); s_wb16(s, 34500); s_wb16(s, 7500); s_wb16(s, 3000); s_wb16(s, 34000); s_wb16(s, 16000); s_wb16(s, 15635); // white_point_x s_wb16(s, 16450); // white_point_y s_wb32(s, max_lum); // max_display_mastering_luminance s_wb32(s, 0); // min_display_mastering_luminance return 32; } /// ISO/IEC 14496-15 5.4.2.1 AVCSampleEntry static size_t mp4_write_avc1(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "avc1"); mp4_write_visual_sample_entry(mux, enc); // avcC mp4_write_avcC(mux, enc); // colr mp4_write_colr(mux, enc); // pasp mp4_write_pasp(mux); return write_box_size(s, start); } /// ISO/IEC 14496-15 8.4.1.1 HEVCSampleEntry static size_t mp4_write_hvc1(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "hvc1"); mp4_write_visual_sample_entry(mux, enc); // avcC mp4_write_hvcC(mux, enc); // colr mp4_write_colr(mux, enc); // clli mp4_write_clli(mux, enc); // mdcv mp4_write_mdcv(mux, enc); // pasp mp4_write_pasp(mux); return write_box_size(s, start); } /// AV1 ISOBMFF 2.2. AV1 Sample Entry static size_t mp4_write_av01(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "av01"); mp4_write_visual_sample_entry(mux, enc); // avcC mp4_write_av1C(mux, enc); // colr mp4_write_colr(mux, enc); // clli mp4_write_clli(mux, enc); // mdcv mp4_write_mdcv(mux, enc); // pasp mp4_write_pasp(mux); return write_box_size(s, start); } /// (QTFF/Apple) Video Sample Description static size_t mp4_write_prores(struct mp4_mux *mux, obs_encoder_t *enc) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); /* We get the tag as an int, but need it as a char[4] */ union tag { char c[4]; uint32_t i; } codec_tag; /* Codec tag varies for ProRes depending on configuration, so we need to get it from the encoder. */ obs_data_t *settings = obs_encoder_get_settings(enc); codec_tag.i = (uint32_t)obs_data_get_int(settings, "codec_type"); obs_data_release(settings); #if __BYTE_ORDER == __LITTLE_ENDIAN codec_tag.i = ((codec_tag.i >> 24) & 0x000000FF) | ((codec_tag.i << 8) & 0x00FF0000) | ((codec_tag.i >> 8) & 0x0000FF00) | ((codec_tag.i << 24) & 0xFF000000); #endif write_box(s, 0, codec_tag.c); mp4_write_visual_sample_entry(mux, enc); // colr mp4_write_colr(mux, enc); // clli mp4_write_clli(mux, enc); // mdcv mp4_write_mdcv(mux, enc); // pasp mp4_write_pasp(mux); return write_box_size(s, start); } static inline void put_descr(struct serializer *s, uint8_t tag, size_t size) { int i = 3; s_w8(s, tag); for (; i > 0; i--) s_w8(s, (uint8_t)((size >> (7 * i)) | 0x80)); s_w8(s, size & 0x7F); } /// ISO/IEC 14496-14 5.6 ESDBox static size_t mp4_write_esds(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "esds", 0, 0); /* Encoder extradata will be used as DecoderSpecificInfo */ uint8_t *extradata; size_t extradata_size; if (!obs_encoder_get_extra_data(track->encoder, &extradata, &extradata_size)) { extradata_size = 0; } /// ISO/IEC 14496-1 // ES_Descriptor size_t decoder_specific_info_len = extradata_size ? extradata_size + 5 : 0; put_descr(s, 0x03, 3 + 5 + 13 + decoder_specific_info_len + 5 + 1); s_wb16(s, track->track_id); s_w8(s, 0x00); // flags // DecoderConfigDescriptor put_descr(s, 0x04, 13 + decoder_specific_info_len); s_w8(s, 0x40); // codec tag, 0x40 = AAC s_w8(s, 0x15); // stream type field (0x15 = audio stream) /* When writing the final MOOV this could theoretically be calculated * based on chunks, but it's not really all that important. */ uint32_t bitrate = 0; obs_data_t *settings = obs_encoder_get_settings(track->encoder); if (settings) { int64_t enc_bitrate = obs_data_get_int(settings, "bitrate"); if (enc_bitrate) bitrate = (uint32_t)(enc_bitrate * 1000); obs_data_release(settings); } s_wb24(s, 0); // bufferSizeDB (in bytes) s_wb32(s, bitrate); // maxbitrate s_wb32(s, bitrate); // avgBitrate // DecoderSpecificInfo if (extradata_size) { put_descr(s, 0x05, extradata_size); s_write(s, extradata, extradata_size); } // SLConfigDescriptor descriptor put_descr(s, 0x06, 1); s_w8(s, 0x02); // 0x2 = reserved for MP4, descriptor is empty return write_box_size(s, start); } /// 12.2.3 Audio Sample Entry static inline void mp4_write_audio_sample_entry(struct mp4_mux *mux, struct mp4_track *track, uint8_t version) { struct serializer *s = mux->serializer; bool is_mov = mux->flavor == FLAVOR_MOV; bool is_pcm = track->codec == CODEC_PCM_I16 || track->codec == CODEC_PCM_I24 || track->codec == CODEC_PCM_F32; // SampleEntry Box s_w8(s, 0); // reserved s_w8(s, 0); s_w8(s, 0); s_w8(s, 0); s_w8(s, 0); s_w8(s, 0); s_wb16(s, 1); // data_reference_index // AudioSampleEntry Box s_wb16(s, version); // entry_version s_wb16(s, 0); // reserved s_wb16(s, 0); // reserved s_wb16(s, 0); // reserved audio_t *audio = obs_encoder_audio(track->encoder); uint32_t channels = (uint32_t)audio_output_get_channels(audio); uint32_t sample_rate = track->timescale; bool alac = track->codec == CODEC_ALAC; /* MOV specific version: https://developer.apple.com/documentation/quicktime-file-format/sound_sample_description_version_2 */ if (version == 2) { // We need to get the raw float bytes, union seems to be the easiest way to do that. union rate { uint64_t u; double f; } rate; rate.f = (double)sample_rate; s_wb16(s, 3); // always3 s_wb16(s, 16); // always16 s_wb16(s, 0xfffe); // alwaysMinus2 s_wb16(s, 0); // always0 s_wb32(s, 0x00010000); // always65536 s_wb32(s, 72); // sizeOfStructOnly (start of containing box to constLPCMFramesPerAudioPacket) s_wb64(s, rate.u); // audioSampleRate s_wb32(s, channels); // numAudioChannels s_wb32(s, 0x7F000000); // always7F000000 s_wb32(s, is_pcm ? track->sample_size / channels * 8 : 0); // constBitsPerChannel s_wb32(s, get_lpcm_flags(track->codec)); // formatSpecificFlags s_wb32(s, is_pcm ? track->sample_size : 0); // constBytesPerAudioPacket s_wb32(s, is_pcm ? 1 : 0); // constLPCMFramesPerAudioPacket } else { s_wb16(s, channels); // channelcount /* OBS FLAC is currently always 16-bit, ALAC always 24, this may change in the future and should be * handled differently then. * That being said those codecs are self-describing, so in most cases it shouldn't actually matter. */ s_wb16(s, !is_mov && alac ? 24 : 16); // samplesize s_wb16(s, is_mov && !is_pcm ? -2 : 0); // pre_defined (compression ID in MOV) s_wb16(s, 0); // reserved /* The sample rate field is limited to 16-bits. Technically version 1 supports a "srat" box which * provides 32-bits, but this is not supported by most software (including FFmpeg and Chromium). * For encoded codecs (AAC etc.), the sample rate can be read from the encoded data itself. * For PCM FFmpeg will try to use the timescale as sample rate. */ if (sample_rate > UINT16_MAX) { warn("Sample rate too high for MP4, file may not play back correctly."); sample_rate = 0; } s_wb32(s, sample_rate << 16); // samplerate /* MOV-only data: https://developer.apple.com/documentation/quicktime-file-format/sound_sample_description_version_1 */ if (is_mov && version == 1) { size_t frame_size = obs_encoder_get_frame_size(track->encoder); s_wb32(s, is_pcm ? 1 : (uint32_t)frame_size); // frame size s_wb32(s, is_pcm ? track->sample_size / channels : 0); // bytes per packet s_wb32(s, is_pcm ? track->sample_size : 0); // bytes per frame s_wb32(s, 2); // bytes per sample, 2 for anything but 8-bit } } } /// 12.2.4 Channel layout static size_t mp4_write_chnl(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "chnl", 0, 0); audio_t *audio = obs_encoder_audio(track->encoder); const struct audio_output_info *info = audio_output_get_info(audio); s_w8(s, 1); // stream_structure (1 = channels) /* 5.1 and 4.1 do not have a corresponding ISO layout, so we have to * write a manually created channel map for those. */ uint8_t map[8] = {0}; uint8_t items = 0; uint8_t defined_layout = 0; get_speaker_positions(info->speakers, map, &items, &defined_layout); if (!defined_layout) { warn("No ISO layout available for speaker layout %d, " "this may not be supported by all applications!", info->speakers); s_w8(s, 0); // definedLayout s_write(s, map, items); // uint8_t speaker_position[count] } else { s_w8(s, defined_layout); // definedLayout s_wb64(s, 0); // ommitedChannelMap } return write_box_size(s, start); } /// ISO/IEC 14496-14 5.6 MP4AudioSampleEntry static size_t mp4_write_mp4a(struct mp4_mux *mux, struct mp4_track *track, uint8_t version) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "mp4a"); mp4_write_audio_sample_entry(mux, track, version); // esds mp4_write_esds(mux, track); /* Write channel layout for version 1 sample entires */ if (version == 1) mp4_write_chnl(mux, track); return write_box_size(s, start); } /// Encapsulation of FLAC in ISO Base Media File Format 3.3.2 FLAC Specific Box static size_t mp4_write_dfLa(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); uint8_t *extradata; size_t extradata_size; if (!obs_encoder_get_extra_data(track->encoder, &extradata, &extradata_size)) return 0; write_fullbox(s, 0, "dfLa", 0, 0); /// FLACMetadataBlock // LastMetadataBlockFlag (1) | BlockType (0) s_w8(s, 1 << 7 | 0); // Length s_wb24(s, (uint32_t)extradata_size); // BlockData[Length] s_write(s, extradata, extradata_size); return write_box_size(s, start); } /// Encapsulation of FLAC in ISO Base Media File Format 3.3.1 FLACSampleEntry static size_t mp4_write_fLaC(struct mp4_mux *mux, struct mp4_track *track, uint8_t version) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "fLaC"); mp4_write_audio_sample_entry(mux, track, version); // dfLa mp4_write_dfLa(mux, track); if (version == 1) mp4_write_chnl(mux, track); return write_box_size(s, start); } /// Apple Lossless Format "Magic Cookie" Description - MP4/M4A File static size_t mp4_write_alac(struct mp4_mux *mux, struct mp4_track *track, uint8_t version) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); uint8_t *extradata; size_t extradata_size; if (!obs_encoder_get_extra_data(track->encoder, &extradata, &extradata_size)) return 0; write_box(s, 0, "alac"); mp4_write_audio_sample_entry(mux, track, version); /* Apple Lossless Magic Cookie */ s_write(s, extradata, extradata_size); if (version == 1) mp4_write_chnl(mux, track); return write_box_size(s, start); } /// ISO/IEC 23003-5 5.1 PCM configuration static size_t mp4_write_pcmc(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "pcmC", 0, 0); s_w8(s, 1); // endianness, 1 = little endian // bits per sample if (track->codec == CODEC_PCM_I16) s_w8(s, 16); else if (track->codec == CODEC_PCM_I24) s_w8(s, 24); else if (track->codec == CODEC_PCM_F32) s_w8(s, 32); return write_box_size(s, start); } /// ISO/IEC 23003-5 5.1 PCM configuration static size_t mp4_write_xpcm(struct mp4_mux *mux, struct mp4_track *track, uint8_t version) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); /* Different box types for floating point and integer PCM*/ write_box(s, 0, track->codec == CODEC_PCM_F32 ? "fpcm" : "ipcm"); mp4_write_audio_sample_entry(mux, track, version); /* ChannelLayout (chnl) is required for PCM */ mp4_write_chnl(mux, track); // pcmc mp4_write_pcmc(mux, track); return write_box_size(s, start); } /// (QTFF/Apple) Text sample description static size_t mp4_write_text(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "text", 0, 0); s_wb32(s, 1); // number of entries /* Preset sample description as used by FFmpeg. */ s_write(s, &TEXT_STUB_HEADER, sizeof(TEXT_STUB_HEADER)); return write_box_size(s, start); } static inline uint32_t rl32(const uint8_t *ptr) { return (ptr[3] << 24) + (ptr[2] << 16) + (ptr[1] << 8) + ptr[0]; } static inline uint16_t rl16(const uint8_t *ptr) { return (ptr[1] << 8) + ptr[0]; } /// Encapsulation of Opus in ISO Base Media File Format 4.3.2 Opus Specific Box static size_t mp4_write_dOps(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); uint8_t *extradata; size_t extradata_size; if (!obs_encoder_get_extra_data(track->encoder, &extradata, &extradata_size)) return 0; write_box(s, 0, "dOps"); s_w8(s, 0); // version uint8_t channels = *(extradata + 9); uint8_t channel_map = *(extradata + 18); s_w8(s, channels); // channel count // OpusHead is little-endian, but MP4 is big-endian, so we have to swap them here s_wb16(s, rl16(extradata + 10)); // pre-skip s_wb32(s, rl32(extradata + 12)); // input sample rate s_wb16(s, rl16(extradata + 16)); // output gain s_w8(s, channel_map); // channel mapping family if (channel_map) s_write(s, extradata + 19, 2 + channels); return write_box_size(s, start); } /// Encapsulation of Opus in ISO Base Media File Format 4.3.1 Sample entry format static size_t mp4_write_Opus(struct mp4_mux *mux, struct mp4_track *track, uint8_t version) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "Opus"); mp4_write_audio_sample_entry(mux, track, version); // dOps mp4_write_dOps(mux, track); if (version == 1) mp4_write_chnl(mux, track); return write_box_size(s, start); } /// (QTFF/Apple) siDecompressionParam Atom ('wave') static size_t mp4_write_wave(struct mp4_mux *mux, struct mp4_track *track, const char tag[4]) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "wave"); /* frma atom containing codec tag (again) */ s_wb32(s, 12); s_write(s, "frma", 4); s_write(s, tag, 4); if (track->codec == CODEC_AAC) { mp4_write_esds(mux, track); } else if (track->codec == CODEC_ALAC) { uint8_t *extradata; size_t extradata_size; if (obs_encoder_get_extra_data(track->encoder, &extradata, &extradata_size)) { /* Apple Lossless Magic Cookie */ s_write(s, extradata, extradata_size); } } /* Terminator atom */ s_wb32(s, 8); // size s_wb32(s, 0); // NULL name return write_box_size(s, start); } /// (QTFF/Apple) Audio Channel Layout Atom (‘chan’) static size_t mp4_write_chan(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); audio_t *audio = obs_encoder_audio(track->encoder); const struct audio_output_info *info = audio_output_get_info(audio); uint32_t layout = get_mov_channel_layout(track->codec, info->speakers); uint32_t bitmap = layout == kAudioChannelLayoutTag_UseChannelBitmap ? get_mov_channel_bitmap(info->speakers) : 0; if (layout == kAudioChannelLayoutTag_UseChannelBitmap && !bitmap) { warn("No valid speaker layout found, not writing chan box. File may not play back correctly!"); return 0; } write_fullbox(s, 0, "chan", 0, 0); /* AudioChannelLayout from CoreAudioTypes.h */ s_wb32(s, layout); // mChannelLayoutTag s_wb32(s, bitmap); // mChannelBitmap s_wb32(s, 0); // mNumberChannelDescriptions return write_box_size(s, start); } /// (QTFF/Apple) Sound Sample Description (v1 and v2) static size_t mp4_write_mov_audio_tag(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); const char *tag = NULL; audio_t *audio = obs_encoder_audio(track->encoder); uint32_t sample_rate = audio_output_get_sample_rate(audio); size_t channels = audio_output_get_channels(audio); /* More than 2 channels or samples rates above 65535 Hz requires v2 */ uint8_t version = (channels > 2 || sample_rate > UINT16_MAX) ? 2 : 1; if (track->codec == CODEC_PCM_F32 || track->codec == CODEC_PCM_I16 || track->codec == CODEC_PCM_I24) { tag = "lpcm"; version = 2; /* lpcm also requires v2 */ } else if (track->codec == CODEC_AAC) { tag = "mp4a"; } else if (track->codec == CODEC_ALAC) { tag = "alac"; } /* Unsupported/Unknown codec */ if (!tag) return 0; write_box(s, 0, tag); mp4_write_audio_sample_entry(mux, track, version); // wave if (version == 1) mp4_write_wave(mux, track, tag); // chan mp4_write_chan(mux, track); return write_box_size(s, start); } /// 8.5.2 Sample Description Box static size_t mp4_write_stsd(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); /* Anything but mono or stereo technically requires v1, * but in practice that doesn't appear to matter. */ uint8_t version = 0; if (track->type == TRACK_AUDIO && mux->flavor != FLAVOR_MOV) { audio_t *audio = obs_encoder_audio(track->encoder); version = audio_output_get_channels(audio) > 2 ? 1 : 0; } write_fullbox(s, 0, "stsd", version, 0); s_wb32(s, 1); // entry_count // codec specific boxes if (track->type == TRACK_VIDEO) { if (track->codec == CODEC_H264) mp4_write_avc1(mux, track->encoder); else if (track->codec == CODEC_HEVC) mp4_write_hvc1(mux, track->encoder); else if (track->codec == CODEC_AV1) mp4_write_av01(mux, track->encoder); else if (track->codec == CODEC_PRORES) mp4_write_prores(mux, track->encoder); } else if (track->type == TRACK_AUDIO) { if (mux->flavor == FLAVOR_MOV) { mp4_write_mov_audio_tag(mux, track); } else { if (track->codec == CODEC_AAC) mp4_write_mp4a(mux, track, version); else if (track->codec == CODEC_OPUS) mp4_write_Opus(mux, track, version); else if (track->codec == CODEC_FLAC) mp4_write_fLaC(mux, track, version); else if (track->codec == CODEC_ALAC) mp4_write_alac(mux, track, version); else if (track->codec == CODEC_PCM_I16 || track->codec == CODEC_PCM_I24 || track->codec == CODEC_PCM_F32) mp4_write_xpcm(mux, track, version); } } else if (track->type == TRACK_CHAPTERS) { mp4_write_text(mux); } return write_box_size(s, start); } /// 8.6.1.2 Decoding Time to Sample Box static size_t mp4_write_stts(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; if (fragmented) { write_fullbox(s, 16, "stts", 0, 0); s_wb32(s, 0); // entry_count return 16; } int64_t start = serializer_get_pos(s); struct sample_delta *arr = track->deltas.array; size_t num = track->deltas.num; write_fullbox(s, 0, "stts", 0, 0); s_wb32(s, (uint32_t)num); // entry_count for (size_t idx = 0; idx < num; idx++) { struct sample_delta *smp = &arr[idx]; uint64_t delta = util_mul_div64(smp->delta, track->timescale, track->timebase_den); s_wb32(s, smp->count); // sample_count s_wb32(s, (uint32_t)delta); // sample_delta } return write_box_size(s, start); } /// 8.6.2 Sync Sample Box static size_t mp4_write_stss(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; uint32_t num = (uint32_t)track->sync_samples.num; if (!num) return 0; /* 16 byte FullBox header + 4-bytes (u32) per sync sample */ uint32_t size = 16 + 4 * num; write_fullbox(s, size, "stss", 0, 0); s_wb32(s, num); // entry_count for (size_t idx = 0; idx < num; idx++) s_wb32(s, track->sync_samples.array[idx]); // sample_number return size; } /// 8.6.1.3 Composition Time to Sample Box static size_t mp4_write_ctts(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; uint32_t num = (uint32_t)track->offsets.num; uint8_t version = mux->flags & MP4_USE_NEGATIVE_CTS ? 1 : 0; /* 16 byte FullBox header + 8-bytes (u32+u32/i32) per offset entry */ uint32_t size = 16 + 8 * num; write_fullbox(s, size, "ctts", version, 0); s_wb32(s, num); // entry_count for (size_t idx = 0; idx < num; idx++) { int64_t offset = (int64_t)track->offsets.array[idx].offset * (int64_t)track->timescale / (int64_t)track->timebase_den; s_wb32(s, track->offsets.array[idx].count); // sample_count s_wb32(s, (uint32_t)offset); // sample_offset } return size; } /// 8.7.4 Sample To Chunk Box static size_t mp4_write_stsc(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; if (fragmented) { write_fullbox(s, 16, "stsc", 0, 0); s_wb32(s, 0); // entry_count return 16; } struct chunk *arr = track->chunks.array; size_t arr_num = track->chunks.num; /* Compress into array with counter for repeating chunk sizes */ DARRAY(struct chunk_run { uint32_t first; uint32_t samples; }) chunk_runs; da_init(chunk_runs); for (size_t idx = 0; idx < arr_num; idx++) { struct chunk *chk = &arr[idx]; if (!chunk_runs.num || chunk_runs.array[chunk_runs.num - 1].samples != chk->samples) { struct chunk_run *cr = da_push_back_new(chunk_runs); cr->samples = chk->samples; cr->first = (uint32_t)idx + 1; // ISO-BMFF is 1-indexed } } uint32_t num = (uint32_t)chunk_runs.num; /* 16 byte FullBox header + 12-bytes (u32+u32+u32) per chunk run */ uint32_t size = 16 + 12 * num; write_fullbox(s, size, "stsc", 0, 0); s_wb32(s, num); // entry_count for (size_t idx = 0; idx < num; idx++) { struct chunk_run *cr = &chunk_runs.array[idx]; s_wb32(s, cr->first); // first_chunk s_wb32(s, cr->samples); // samples_per_chunk s_wb32(s, 1); // sample_description_index } da_free(chunk_runs); return size; } /// 8.7.3 Sample Size Boxes static size_t mp4_write_stsz(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; if (fragmented) { write_fullbox(s, 20, "stsz", 0, 0); s_wb32(s, 0); // sample_size s_wb32(s, 0); // sample_count return 20; } int64_t start = serializer_get_pos(s); /* This should only ever happen when recording > 24 hours of * 48 kHz PCM audio or 828 days of 60 FPS video. */ if (track->samples > UINT32_MAX) { warn("Track %u has too many samples, its duration may not be " "read correctly. Remuxing the file to another format such " "as MKV may be required.", track->track_id); } write_fullbox(s, 0, "stsz", 0, 0); if (track->sample_size) { /* Fixed size samples mean we don't need an array */ s_wb32(s, track->sample_size); // sample_size s_wb32(s, (uint32_t)track->samples); // sample_count } else { s_wb32(s, 0); // sample_size s_wb32(s, (uint32_t)track->sample_sizes.num); // sample_count for (size_t idx = 0; idx < track->sample_sizes.num; idx++) { s_wb32(s, track->sample_sizes.array[idx]); // entry_size } } return write_box_size(s, start); } /// 8.7.5 Chunk Offset Box static size_t mp4_write_stco(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; if (fragmented) { write_fullbox(s, 16, "stco", 0, 0); s_wb32(s, 0); // entry_count return 16; } struct chunk *arr = track->chunks.array; uint32_t num = (uint32_t)track->chunks.num; uint64_t last_off = arr[num - 1].offset; uint32_t size; bool co64 = last_off > UINT32_MAX; /* When using 64-bit offsets we write 8-bytes (u64) per chunk, * otherwise 4-bytes (u32). */ if (co64) { size = 16 + 8 * num; write_fullbox(s, size, "co64", 0, 0); } else { size = 16 + 4 * num; write_fullbox(s, size, "stco", 0, 0); } s_wb32(s, num); // entry_count for (size_t idx = 0; idx < num; idx++) { if (co64) s_wb64(s, arr[idx].offset); // chunk_offset else s_wb32(s, (uint32_t)arr[idx].offset); // chunk_offset } return size; } /// 8.9.3 Sample Group Description Box static size_t mp4_write_sgpd_aac(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "sgpd", 1, 0); s_write(s, "roll", 4); // grouping_tpye s_wb32(s, 2); // default_length (i16) s_wb32(s, 1); // entry_count // AudioRollRecoveryEntry s_wb16(s, -1); // roll_distance return write_box_size(s, start); } /// 8.9.2 Sample to Group Box static size_t mp4_write_sbgp_aac(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "sbgp", 0, 0); /// 10.1 AudioRollRecoveryEntry s_write(s, "roll", 4); // grouping_tpye s_wb32(s, 1); // entry_count s_wb32(s, (uint32_t)track->samples); // sample_count s_wb32(s, 1); // group_description_index return write_box_size(s, start); } static size_t mp4_write_sbgp_sbgp_opus(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); /// 8.9.3 Sample Group Description Box write_fullbox(s, 0, "sgpd", 1, 0); s_write(s, "roll", 4); // grouping_tpye s_wb32(s, 2); // default_length (i16) /* Opus requires 80 ms of preroll, which at 48 kHz is 3840 PCM samples */ const int64_t opus_preroll = 3840; /* Compute the preroll samples (should be 4, each being 20 ms) */ uint16_t preroll_count = 0; int64_t preroll_remaining = opus_preroll; for (size_t i = 0; i < track->deltas.num && preroll_remaining > 0; i++) { for (uint32_t j = 0; j < track->deltas.array[i].count && preroll_remaining > 0; j++) { preroll_remaining -= track->deltas.array[i].delta; preroll_count++; } } s_wb32(s, 1); // entry_count /// 10.1 AudioRollRecoveryEntry s_wb16(s, -preroll_count); // roll_distance size_t size_sgpd = write_box_size(s, start); /* --------------- */ /// 8.9.2 Sample to Group Box start = serializer_get_pos(s); write_fullbox(s, 0, "sbgp", 0, 0); s_write(s, "roll", 4); // grouping_tpye s_wb32(s, 2); // entry_count // entry 0 s_wb32(s, preroll_count); // sample_count s_wb32(s, 0); // group_description_index // entry 1 s_wb32(s, (uint32_t)track->samples - preroll_count); // sample_count s_wb32(s, 1); // group_description_index return size_sgpd + write_box_size(s, start); } /// 8.5.1 Sample Table Box static size_t mp4_write_stbl(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "stbl"); // stsd mp4_write_stsd(mux, track); // stts mp4_write_stts(mux, track, fragmented); // stss (non-fragmented/non-prores only) if (track->type == TRACK_VIDEO && !fragmented && track->codec != CODEC_PRORES) mp4_write_stss(mux, track); // ctts (non-fragmented only) if (track->needs_ctts && !fragmented) mp4_write_ctts(mux, track); // stsc mp4_write_stsc(mux, track, fragmented); // stsz mp4_write_stsz(mux, track, fragmented); // stco mp4_write_stco(mux, track, fragmented); if (!fragmented) { /* AAC and Opus require a pre-roll to get correct decoder * output, sgpd and sbgp are used to create a "roll" group. */ if (track->codec == CODEC_AAC) { // sgpd mp4_write_sgpd_aac(mux); // sbgp mp4_write_sbgp_aac(mux, track); } else if (track->codec == CODEC_OPUS) { // sgpd + sbgp mp4_write_sbgp_sbgp_opus(mux, track); } } return write_box_size(s, start); } /// 8.7.2.2 DataEntryUrlBox static size_t mp4_write_url(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "url ", 0, 1); /* empty, flag 1 means data is in this file */ return write_box_size(s, start); } /// 8.7.2 Data Reference Box static size_t mp4_write_dref(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "dref ", 0, 0); s_wb32(s, 1); // entry_count mp4_write_url(mux); return write_box_size(s, start); } /// 8.7.1 Data Information Box static size_t mp4_write_dinf(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "dinf"); mp4_write_dref(mux); return write_box_size(s, start); } /// 8.4.4 Media Information Box static size_t mp4_write_minf(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "minf"); // vmhd/smhd/gmhd if (track->type == TRACK_VIDEO) mp4_write_vmhd(mux); else if (track->type == TRACK_CHAPTERS) mp4_write_gmhd(mux); else mp4_write_smhd(mux); // hdlr for dinf, required in MOV only if (mux->flavor == FLAVOR_MOV) mp4_write_hdlr(mux, NULL); // dinf, unnecessary but mandatory mp4_write_dinf(mux); // stbl mp4_write_stbl(mux, track, fragmented); return write_box_size(s, start); } /// 8.4.1 Media Box static size_t mp4_write_mdia(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "mdia"); // mdhd mp4_write_mdhd(mux, track); // hdlr mp4_write_hdlr(mux, track); // minf mp4_write_minf(mux, track, fragmented); return write_box_size(s, start); } /// (QTFF/Apple) User data atom static size_t mp4_write_udta_atom(struct mp4_mux *mux, const char tag[4], const char *val) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, tag); s_write(s, val, strlen(val)); return write_box_size(s, start); } /// 8.10.1 User Data Box static size_t mp4_write_track_udta(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "udta"); /* Our udta box contains QuickTime format user data atoms, which are * simple key-value pairs. Some are prefixed with 0xa9. */ const char *name = obs_encoder_get_name(track->encoder); if (name) mp4_write_udta_atom(mux, "name", name); if (mux->flags & MP4_WRITE_ENCODER_INFO) { const char *id = obs_encoder_get_id(track->encoder); if (name) mp4_write_udta_atom(mux, "\251enc", id); obs_data_t *settings = obs_encoder_get_settings(track->encoder); if (settings) { const char *json = obs_data_get_json_with_defaults(settings); mp4_write_udta_atom(mux, "json", json); obs_data_release(settings); } } return write_box_size(s, start); } /// 8.6.6 Edit List Box static size_t mp4_write_elst(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "elst", 0, 0); s_wb32(s, 1); // entry count uint64_t duration = util_mul_div64(track->duration, 1000, track->timebase_den); uint64_t delay = 0; if (track->type == TRACK_VIDEO && !(mux->flags & MP4_USE_NEGATIVE_CTS)) { /* Compensate for frame-reordering delay (for example, when * using b-frames). */ int64_t dts_offset = 0; if (track->offsets.num) { struct sample_offset sample = track->offsets.array[0]; dts_offset = sample.offset; } else if (track->packets.size) { /* If no offset data exists yet (i.e. when writing the * incomplete moov in a fragmented file) use the raw * data from the current queued packets instead. */ struct encoder_packet pkt; deque_peek_front(&track->packets, &pkt, sizeof(pkt)); dts_offset = pkt.pts - pkt.dts; } delay = util_mul_div64(dts_offset, track->timescale, track->timebase_den); } else if (track->type == TRACK_AUDIO && track->first_pts < 0) { delay = util_mul_div64(llabs(track->first_pts), track->timescale, track->timebase_den); /* Subtract priming delay from total duration */ duration -= util_mul_div64(delay, 1000, track->timescale); } s_wb32(s, (uint32_t)duration); // segment_duration (movie timescale) s_wb32(s, (uint32_t)delay); // media_time (track timescale) s_wb32(s, 1 << 16); // media_rate return write_box_size(s, start); } /// 8.6.5 Edit Box static size_t mp4_write_edts(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "edts"); mp4_write_elst(mux, track); return write_box_size(s, start); } /// 8.3.3.2 TrackReferenceTypeBox static size_t mp4_write_chap(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); /// QTFF/Apple chapter track reference write_box(s, 0, "chap"); s_wb32(s, mux->chapter_track->track_id); return write_box_size(s, start); } /// 8.3.3 Track Reference Box static size_t mp4_write_tref(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "tref"); mp4_write_chap(mux); return write_box_size(s, start); } /// 8.3.1 Track Box static size_t mp4_write_trak(struct mp4_mux *mux, struct mp4_track *track, bool fragmented) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); /* If track has no data, omit it from full moov. */ if (!fragmented && !track->chunks.num) return 0; write_box(s, 0, "trak"); // tkhd mp4_write_tkhd(mux, track); // edts mp4_write_edts(mux, track); // tref if (mux->chapter_track && track->type != TRACK_CHAPTERS) mp4_write_tref(mux); // mdia mp4_write_mdia(mux, track, fragmented); // udta (audio track name mainly) mp4_write_track_udta(mux, track); return write_box_size(s, start); } /// 8.8.3 Track Extends Box static size_t mp4_write_trex(struct mp4_mux *mux, uint32_t track_id) { struct serializer *s = mux->serializer; write_fullbox(s, 32, "trex", 0, 0); s_wb32(s, track_id); // track_ID s_wb32(s, 1); // default_sample_description_index s_wb32(s, 0); // default_sample_duration s_wb32(s, 0); // default_sample_size s_wb32(s, 0); // default_sample_flags return 32; } /// 8.8.1 Movie Extends Box static size_t mp4_write_mvex(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "mvex"); for (size_t track_id = 0; track_id < mux->tracks.num; track_id++) mp4_write_trex(mux, (uint32_t)(track_id + 1)); return write_box_size(s, start); } /// (QTFF/Apple) Undocumented QuickTime/iTunes metadata handler static size_t mp4_write_itunes_hdlr(struct mp4_mux *mux) { struct serializer *s = mux->serializer; write_fullbox(s, 33, "hdlr", 0, 0); s_wb32(s, 0); // pre_defined s_write(s, "mdir", 4); // handler_type // reserved s_write(s, "appl", 4); s_wb32(s, 0); s_wb32(s, 0); s_w8(s, 0); // name (NULL) return 33; } /// (QTFF/Apple) Data atom static size_t mp4_write_data_atom(struct mp4_mux *mux, const char *data) { struct serializer *s = mux->serializer; size_t len = strlen(data); uint32_t size = 16 + (uint32_t)len; write_box(s, size, "data"); s_wb32(s, 1); // type, 1 = utf-8 string s_wb32(s, 0); // locale, 0 = default s_write(s, data, len); return size; } /// (QTFF/Apple) String atom static size_t mp4_write_string_data_atom(struct mp4_mux *mux, const char name[4], const char *data) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); uint16_t len = (uint16_t)strlen(data); write_box(s, 0, name); s_wb16(s, len); // String length s_write(s, "\x55\xC4", 2); // language code, just using undefined s_write(s, data, len); // Note: No NULL terminator return write_box_size(s, start); } /// (QTFF/Apple) Metadata item atom static size_t mp4_write_ilst_item_atom(struct mp4_mux *mux, const char name[4], const char *value) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, name); mp4_write_data_atom(mux, value); return write_box_size(s, start); } /// (QTFF/Apple) Metadata item list atom static size_t mp4_write_ilst(struct mp4_mux *mux) { struct serializer *s = mux->serializer; struct dstr value = {0}; int64_t start = serializer_get_pos(s); write_box(s, 0, "ilst"); /* Encoder name */ dstr_cat(&value, "OBS Studio ("); dstr_cat(&value, obs_get_version_string()); dstr_cat(&value, ")"); /* Some QuickTime keys are prefixed with 0xa9 */ mp4_write_ilst_item_atom(mux, "\251too", value.array); dstr_free(&value); return write_box_size(s, start); } /// (QTFF/Apple) Key value metadata handler static size_t mp4_write_mdta_hdlr(struct mp4_mux *mux) { struct serializer *s = mux->serializer; write_fullbox(s, 33, "hdlr", 0, 0); s_wb32(s, 0); // pre_defined s_write(s, "mdta", 4); // handler_type // reserved s_wb32(s, 0); s_wb32(s, 0); s_wb32(s, 0); s_w8(s, 0); // name (NULL) return 33; } /// (QTFF/Apple) Metadata item keys atom static size_t mp4_write_mdta_keys(struct mp4_mux *mux, obs_data_t *meta) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "keys", 0, 0); uint32_t count = 0; int64_t count_pos = serializer_get_pos(s); s_wb32(s, count); // count obs_data_item_t *item = obs_data_first(meta); for (; item != NULL; obs_data_item_next(&item)) { const char *name = obs_data_item_get_name(item); size_t len = strlen(name); /* name is key type, can be udta or mdta */ write_box(s, len + 8, "mdta"); s_write(s, name, len); // key name count++; } int64_t end = serializer_get_pos(s); /* Overwrite count with correct value */ serializer_seek(s, count_pos, SERIALIZE_SEEK_START); s_wb32(s, count); serializer_seek(s, end, SERIALIZE_SEEK_START); return write_box_size(s, start); } /// (QTFF/Apple) Metadata item atom, but name is an index instead static inline void write_key_entry(struct mp4_mux *mux, obs_data_item_t *item, uint32_t idx) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); s_wb32(s, 0); // size s_wb32(s, idx); // index mp4_write_data_atom(mux, obs_data_item_get_string(item)); write_box_size(s, start); } /// (QTFF/Apple) Metadata item list atom static size_t mp4_write_mdta_ilst(struct mp4_mux *mux, obs_data_t *meta) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "ilst"); /* indices start with 1 */ uint32_t key_idx = 1; obs_data_item_t *item = obs_data_first(meta); for (; item != NULL; obs_data_item_next(&item)) { write_key_entry(mux, item, key_idx); key_idx++; } return write_box_size(s, start); } static void mp4_write_mdta_kv(struct mp4_mux *mux) { struct dstr value = {0}; obs_data_t *meta = obs_data_create(); dstr_cat(&value, "OBS Studio ("); dstr_cat(&value, obs_get_version_string()); dstr_cat(&value, ")"); // ToDo figure out what else we could put in here for fun and profit :) obs_data_set_string(meta, "tool", value.array); /* Write keys */ mp4_write_mdta_keys(mux, meta); /* Write values */ mp4_write_mdta_ilst(mux, meta); obs_data_release(meta); dstr_free(&value); } /// 8.11.1 The Meta box static size_t mp4_write_meta(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_fullbox(s, 0, "meta", 0, 0); if (mux->flags & MP4_USE_MDTA_KEY_VALUE) { mp4_write_mdta_hdlr(mux); mp4_write_mdta_kv(mux); } else { mp4_write_itunes_hdlr(mux); mp4_write_ilst(mux); } return write_box_size(s, start); } /// 8.10.1 User Data Box static size_t mp4_write_udta(struct mp4_mux *mux) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "udta"); /* Normally metadata would be directly in the moov, but since this is * Apple/QTFF format metadata it is inside udta. */ if (mux->flavor == FLAVOR_MOV && !(mux->flags & MP4_USE_MDTA_KEY_VALUE)) { // keys directly in udta atom struct dstr value = {0}; /* Encoder name */ dstr_cat(&value, "OBS Studio ("); dstr_cat(&value, obs_get_version_string()); dstr_cat(&value, ")"); mp4_write_string_data_atom(mux, "\251swr", value.array); dstr_free(&value); } else { // meta mp4_write_meta(mux); } return write_box_size(s, start); } /// Movie Box (8.2.1) static size_t mp4_write_moov(struct mp4_mux *mux, bool fragmented) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "moov"); mp4_write_mvhd(mux); // trak(s) for (size_t i = 0; i < mux->tracks.num; i++) { struct mp4_track *track = &mux->tracks.array[i]; mp4_write_trak(mux, track, fragmented); } if (!fragmented && mux->chapter_track) mp4_write_trak(mux, mux->chapter_track, false); // mvex if (fragmented) mp4_write_mvex(mux); // udta (metadata) mp4_write_udta(mux); return write_box_size(s, start); } /* ========================================================================== */ /* moof (fragment header) stuff */ /// 8.8.5 Movie Fragment Header Box static size_t mp4_write_mfhd(struct mp4_mux *mux) { struct serializer *s = mux->serializer; write_fullbox(s, 16, "mfhd", 0, 0); s_wb32(s, mux->fragments_written); // sequence_number return 16; } /// 8.8.7 Track Fragment Header Box static size_t mp4_write_tfhd(struct mp4_mux *mux, struct mp4_track *track, size_t moof_start) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); uint32_t flags = BASE_DATA_OFFSET_PRESENT | DEFAULT_SAMPLE_FLAGS_PRESENT; /* Add default size/duration if all samples match. */ bool durations_match = true; bool sizes_match = true; uint32_t duration; uint32_t sample_size; if (track->sample_size) { duration = 1; sample_size = track->sample_size; } else { duration = track->fragment_samples.array[0].duration; sample_size = track->fragment_samples.array[0].size; for (size_t idx = 1; idx < track->fragment_samples.num; idx++) { uint32_t frag_duration = track->fragment_samples.array[idx].duration; uint32_t frag_size = track->fragment_samples.array[idx].size; durations_match = frag_duration == duration; sizes_match = frag_size == sample_size; } } if (durations_match) flags |= DEFAULT_SAMPLE_DURATION_PRESENT; if (sizes_match) flags |= DEFAULT_SAMPLE_SIZE_PRESENT; write_fullbox(s, 0, "tfhd", 0, flags); s_wb32(s, track->track_id); // track_ID s_wb64(s, moof_start); // base_data_offset // default_sample_duration if (durations_match) { if (track->type == TRACK_VIDEO) { /* Convert duration to track timescale */ duration = (uint32_t)util_mul_div64(duration, track->timescale, track->timebase_den); } s_wb32(s, duration); } // default_sample_size if (sizes_match) s_wb32(s, sample_size); // default_sample_flags if (track->type == TRACK_VIDEO) { s_wb32(s, SAMPLE_FLAG_DEPENDS_YES | SAMPLE_FLAG_IS_NON_SYNC); } else { s_wb32(s, SAMPLE_FLAG_DEPENDS_NO); } return write_box_size(s, start); } /// 8.8.12 Track fragment decode time static size_t mp4_write_tfdt(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; write_fullbox(s, 20, "tfdt", 1, 0); /* Subtract samples that are not written yet */ uint64_t duration_written = track->duration; for (size_t i = 0; i < track->fragment_samples.num; i++) duration_written -= track->fragment_samples.array[i].duration; if (track->type == TRACK_VIDEO) { /* Convert to track timescale */ duration_written = util_mul_div64(duration_written, track->timescale, track->timebase_den); } s_wb64(s, duration_written); // baseMediaDecodeTime return 20; } /// 8.8.8 Track Fragment Run Box static size_t mp4_write_trun(struct mp4_mux *mux, struct mp4_track *track, uint32_t moof_size, uint64_t *samples_mdat_offset) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); uint32_t flags = DATA_OFFSET_PRESENT; if (!track->sample_size) flags |= SAMPLE_SIZE_PRESENT; if (track->type == TRACK_VIDEO) { flags |= FIRST_SAMPLE_FLAGS_PRESENT; flags |= SAMPLE_COMPOSITION_TIME_OFFSETS_PRESENT; } uint8_t version = mux->flags & MP4_USE_NEGATIVE_CTS ? 1 : 0; write_fullbox(s, 0, "trun", version, flags); /* moof_size + 8 bytes for mdat header + offset into mdat box data */ size_t data_offset = moof_size + 8 + *samples_mdat_offset; size_t sample_count = track->fragment_samples.num; if (track->sample_size) { /* Update count based on fixed size */ size_t total_size = 0; for (size_t i = 0; i < sample_count; i++) total_size += track->fragment_samples.array[i].size; *samples_mdat_offset += total_size; sample_count = total_size / track->sample_size; } s_wb32(s, (uint32_t)sample_count); // sample_count s_wb32(s, (uint32_t)data_offset); // data_offset /* If we have a fixed sample size (PCM audio) we only need to write * the sample count and offset. */ if (track->sample_size) return write_box_size(s, start); if (track->type == TRACK_VIDEO) s_wb32(s, SAMPLE_FLAG_DEPENDS_NO); // first_sample_flags for (size_t idx = 0; idx < sample_count; idx++) { struct fragment_sample *smp = &track->fragment_samples.array[idx]; s_wb32(s, smp->size); // sample_size if (track->type == TRACK_VIDEO) { // sample_composition_time_offset int64_t offset = (int64_t)smp->offset * (int64_t)track->timescale / (int64_t)track->timebase_den; s_wb32(s, (uint32_t)offset); } *samples_mdat_offset += smp->size; } return write_box_size(s, start); } /// 8.8.6 Track Fragment Box static size_t mp4_write_traf(struct mp4_mux *mux, struct mp4_track *track, int64_t moof_start, uint32_t moof_size, uint64_t *samples_mdat_offset) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "traf"); // tfhd mp4_write_tfhd(mux, track, moof_start); // tfdt mp4_write_tfdt(mux, track); // trun mp4_write_trun(mux, track, moof_size, samples_mdat_offset); return write_box_size(s, start); } /// 8.8.4 Movie Fragment Box static size_t mp4_write_moof(struct mp4_mux *mux, uint32_t moof_size, int64_t moof_start) { struct serializer *s = mux->serializer; int64_t start = serializer_get_pos(s); write_box(s, 0, "moof"); mp4_write_mfhd(mux); /* Track current mdat offset across tracks */ uint64_t samples_mdat_offset = 0; // traf boxes for (size_t i = 0; i < mux->tracks.num; i++) { struct mp4_track *track = &mux->tracks.array[i]; /* Skip tracks that do not have any samples */ if (!track->fragment_samples.num) continue; mp4_write_traf(mux, track, moof_start, moof_size, &samples_mdat_offset); } return write_box_size(s, start); } /* ========================================================================== */ /* Chapter packets */ static void mp4_create_chapter_pkt(struct encoder_packet *pkt, int64_t dts_usec, const char *name) { int64_t dts = dts_usec / 1000; // chapter track uses a ms timebase pkt->pts = dts; pkt->dts = dts; pkt->dts_usec = dts_usec; pkt->timebase_num = 1; pkt->timebase_den = 1000; /* Serialize with data with ref count */ struct serializer s; struct array_output_data ao; array_output_serializer_init(&s, &ao); size_t len = min(strlen(name), UINT16_MAX); long refs = 1; /* encoder_packet refs */ s_write(&s, &refs, sizeof(refs)); /* actual packet data */ s_wb16(&s, (uint16_t)len); s_write(&s, name, len); s_write(&s, &CHAPTER_PKT_FOOTER, sizeof(CHAPTER_PKT_FOOTER)); pkt->data = (void *)(ao.bytes.array + sizeof(long)); pkt->size = ao.bytes.num - sizeof(long); } /* ========================================================================== */ /* Encoder packet processing and fragment writer */ static inline int64_t packet_pts_usec(struct encoder_packet *packet) { return packet->pts * 1000000 / packet->timebase_den; } static inline struct encoder_packet *get_pkt_at(struct deque *dq, size_t idx) { return deque_data(dq, idx * sizeof(struct encoder_packet)); } static inline uint64_t get_longest_track_duration(struct mp4_mux *mux) { uint64_t dur = 0; for (size_t i = 0; i < mux->tracks.num; i++) { struct mp4_track *track = &mux->tracks.array[i]; uint64_t track_dur = util_mul_div64(track->duration, 1000, track->timebase_den); if (track_dur > dur) dur = track_dur; } return dur; } static void process_packets(struct mp4_mux *mux, struct mp4_track *track, uint64_t *mdat_size) { size_t count = track->packets.size / sizeof(struct encoder_packet); if (!count) return; /* Only iterate upt to penultimate packet so we can determine duration * for all processed packets. */ for (size_t i = 0; i < count - 1; i++) { struct encoder_packet *pkt = get_pkt_at(&track->packets, i); if (mux->next_frag_pts && packet_pts_usec(pkt) >= mux->next_frag_pts) break; struct encoder_packet *next = get_pkt_at(&track->packets, i + 1); /* Duration is just distance between current and next DTS. */ uint32_t duration = (uint32_t)(next->dts - pkt->dts); uint32_t sample_count = 1; uint32_t size = (uint32_t)pkt->size; int32_t offset = (int32_t)(pkt->pts - pkt->dts); /* When using negative CTS, subtract DTS-PTS offset. */ if (track->type == TRACK_VIDEO && mux->flags & MP4_USE_NEGATIVE_CTS) { if (!track->offsets.num) track->dts_offset = offset; offset -= track->dts_offset; } /* Create temporary sample information for moof */ struct fragment_sample *smp = da_push_back_new(track->fragment_samples); smp->size = size; smp->offset = offset; smp->duration = duration; *mdat_size += size; /* Update global sample information for full moov */ track->duration += duration; if (track->sample_size) { /* Adjust duration/count for fixed sample size */ sample_count = size / track->sample_size; duration = 1; } if (!track->samples) track->first_pts = pkt->pts; track->samples += sample_count; /* If delta (duration) matche sprevious, increment counter, * otherwise create a new entry. */ if (track->deltas.num == 0 || track->deltas.array[track->deltas.num - 1].delta != duration) { struct sample_delta *new = da_push_back_new(track->deltas); new->delta = duration; new->count = sample_count; } else { track->deltas.array[track->deltas.num - 1].count += sample_count; } if (!track->sample_size) da_push_back(track->sample_sizes, &size); if (track->type != TRACK_VIDEO) continue; if (pkt->keyframe) da_push_back(track->sync_samples, &track->samples); /* Only require ctts box if offet is non-zero */ if (offset && !track->needs_ctts) track->needs_ctts = true; /* If dts-pts offset matche sprevious, increment counter, * otherwise create a new entry. */ if (track->offsets.num == 0 || track->offsets.array[track->offsets.num - 1].offset != offset) { struct sample_offset *new = da_push_back_new(track->offsets); new->offset = offset; new->count = 1; } else { track->offsets.array[track->offsets.num - 1].count += 1; } } } /* Write track data to file */ static void write_packets(struct mp4_mux *mux, struct mp4_track *track) { struct serializer *s = mux->serializer; size_t count = track->packets.size / sizeof(struct encoder_packet); if (!count || !track->fragment_samples.num) return; struct chunk *chk = da_push_back_new(track->chunks); chk->offset = serializer_get_pos(s); chk->samples = (uint32_t)track->fragment_samples.num; for (size_t i = 0; i < track->fragment_samples.num; i++) { struct encoder_packet pkt; deque_pop_front(&track->packets, &pkt, sizeof(struct encoder_packet)); s_write(s, pkt.data, pkt.size); obs_encoder_packet_release(&pkt); } chk->size = (uint32_t)(serializer_get_pos(s) - chk->offset); /* Fixup sample count for fixed-size codecs */ if (track->sample_size) chk->samples = chk->size / track->sample_size; da_clear(track->fragment_samples); } static void mp4_flush_fragment(struct mp4_mux *mux) { struct serializer *s = mux->serializer; // Write file header if not already done if (!mux->fragments_written) { mp4_write_ftyp(mux, true); /* Placeholder to write mdat header during soft-remux */ mux->placeholder_offset = serializer_get_pos(s); mp4_write_free(mux); } // Array output as temporary buffer to avoid sending seeks to disk struct serializer as; struct array_output_data aod; array_output_serializer_init(&as, &aod); mux->serializer = &as; // Write initial incomplete moov (because fragmentation) if (!mux->fragments_written) { mp4_write_moov(mux, true); s_write(s, aod.bytes.array, aod.bytes.num); array_output_serializer_reset(&aod); } mux->fragments_written++; /* --------------------------------------------------------- */ /* Analyse packets and create fragment moof. */ uint64_t mdat_size = 8; for (size_t idx = 0; idx < mux->tracks.num; idx++) { struct mp4_track *track = &mux->tracks.array[idx]; process_packets(mux, track, &mdat_size); } if (!mux->next_frag_pts && mux->chapter_track) { // Create dummy chapter marker at the end so duration is correct uint64_t duration = get_longest_track_duration(mux); struct encoder_packet pkt; mp4_create_chapter_pkt(&pkt, (int64_t)duration * 1000, "Dummy"); deque_push_back(&mux->chapter_track->packets, &pkt, sizeof(struct encoder_packet)); process_packets(mux, mux->chapter_track, &mdat_size); } // write moof once to get size int64_t moof_start = serializer_get_pos(s); size_t moof_size = mp4_write_moof(mux, 0, moof_start); array_output_serializer_reset(&aod); // write moof again with known size mp4_write_moof(mux, (uint32_t)moof_size, moof_start); // Write to output and restore real serializer s_write(s, aod.bytes.array, aod.bytes.num); mux->serializer = s; array_output_serializer_free(&aod); /* --------------------------------------------------------- */ /* Write audio and video samples (in chunks). Also update */ /* global chunk and sample information for final moov. */ if (mdat_size > UINT32_MAX) { s_wb32(s, 1); s_write(s, "mdat", 4); s_wb64(s, mdat_size + 8); } else { s_wb32(s, (uint32_t)mdat_size); s_write(s, "mdat", 4); } for (size_t i = 0; i < mux->tracks.num; i++) { struct mp4_track *track = &mux->tracks.array[i]; write_packets(mux, track); } /* Only write chapter packets on final flush. */ if (!mux->next_frag_pts && mux->chapter_track) write_packets(mux, mux->chapter_track); mux->next_frag_pts = 0; } /* ========================================================================== */ /* Track object functions */ static inline void track_insert_packet(struct mp4_track *track, struct encoder_packet *pkt) { int64_t pts_usec = packet_pts_usec(pkt); if (pts_usec > track->last_pts_usec) track->last_pts_usec = pts_usec; deque_push_back(&track->packets, pkt, sizeof(struct encoder_packet)); } static inline uint32_t get_sample_size(struct mp4_track *track) { audio_t *audio = obs_encoder_audio(track->encoder); if (!audio) return 0; const struct audio_output_info *info = audio_output_get_info(audio); uint32_t channels = get_audio_channels(info->speakers); switch (track->codec) { case CODEC_PCM_F32: return channels * 4; // 4 bytes per sample (32-bit) case CODEC_PCM_I24: return channels * 3; // 3 bytes per sample (24-bit) case CODEC_PCM_I16: return channels * 2; // 2 bytes per sample (16-bit) default: return 0; } } static inline enum mp4_codec get_codec(obs_encoder_t *enc) { const char *codec = obs_encoder_get_codec(enc); if (strcmp(codec, "h264") == 0) return CODEC_H264; if (strcmp(codec, "hevc") == 0) return CODEC_HEVC; if (strcmp(codec, "av1") == 0) return CODEC_AV1; if (strcmp(codec, "prores") == 0) return CODEC_PRORES; if (strcmp(codec, "aac") == 0) return CODEC_AAC; if (strcmp(codec, "opus") == 0) return CODEC_OPUS; if (strcmp(codec, "flac") == 0) return CODEC_FLAC; if (strcmp(codec, "alac") == 0) return CODEC_ALAC; if (strcmp(codec, "pcm_s16le") == 0) return CODEC_PCM_I16; if (strcmp(codec, "pcm_s24le") == 0) return CODEC_PCM_I24; if (strcmp(codec, "pcm_f32le") == 0) return CODEC_PCM_F32; return CODEC_UNKNOWN; } static inline void add_track(struct mp4_mux *mux, obs_encoder_t *enc) { struct mp4_track *track = da_push_back_new(mux->tracks); track->type = obs_encoder_get_type(enc) == OBS_ENCODER_VIDEO ? TRACK_VIDEO : TRACK_AUDIO; track->encoder = obs_encoder_get_ref(enc); track->codec = get_codec(enc); track->track_id = ++mux->track_ctr; /* Set timebase/timescale */ if (track->type == TRACK_VIDEO) { video_t *video = obs_encoder_video(enc); const struct video_output_info *info = video_output_get_info(video); track->timebase_num = info->fps_den; track->timebase_den = info->fps_num; track->timescale = track->timebase_den; } else { uint32_t sample_rate = obs_encoder_get_sample_rate(enc); /* Opus is always 48 kHz */ if (track->codec == CODEC_OPUS) sample_rate = 48000; track->timebase_num = 1; track->timebase_den = sample_rate; track->timescale = sample_rate; } /* Set sample size (if fixed) */ if (track->type == TRACK_AUDIO) track->sample_size = get_sample_size(track); } static inline void add_chapter_track(struct mp4_mux *mux) { mux->chapter_track = bzalloc(sizeof(struct mp4_track)); mux->chapter_track->type = TRACK_CHAPTERS; mux->chapter_track->codec = CODEC_TEXT; mux->chapter_track->timescale = 1000; mux->chapter_track->timebase_num = 1; mux->chapter_track->timebase_den = 1000; mux->chapter_track->track_id = ++mux->track_ctr; } static inline void free_packets(struct deque *dq) { size_t num = dq->size / sizeof(struct encoder_packet); for (size_t i = 0; i < num; i++) { struct encoder_packet pkt; deque_pop_front(dq, &pkt, sizeof(struct encoder_packet)); obs_encoder_packet_release(&pkt); } } static inline void free_track(struct mp4_track *track) { if (!track) return; obs_encoder_release(track->encoder); free_packets(&track->packets); deque_free(&track->packets); da_free(track->sample_sizes); da_free(track->chunks); da_free(track->deltas); da_free(track->offsets); da_free(track->sync_samples); da_free(track->fragment_samples); } /* ===========================================================================*/ /* API */ struct mp4_mux *mp4_mux_create(obs_output_t *output, struct serializer *serializer, enum mp4_mux_flags flags, enum mp4_flavor flavor) { struct mp4_mux *mux = bzalloc(sizeof(struct mp4_mux)); mux->output = output; mux->serializer = serializer; mux->flags = flags; mux->flavor = flavor; /* Timestamp is based on 1904 rather than 1970. */ mux->creation_time = time(NULL) + 0x7C25B080; if (flavor == FLAVOR_MOV && mux->creation_time > UINT32_MAX) { /* This will only happen in 2040 but better safe than sorry! */ warn("Creation time too large for MOV, setting to 0 (unset)."); mux->creation_time = 0; } for (size_t i = 0; i < MAX_OUTPUT_VIDEO_ENCODERS; i++) { obs_encoder_t *enc = obs_output_get_video_encoder2(output, i); if (!enc) continue; add_track(mux, enc); } for (size_t i = 0; i < MAX_OUTPUT_AUDIO_ENCODERS; i++) { obs_encoder_t *enc = obs_output_get_audio_encoder(output, i); if (!enc) continue; add_track(mux, enc); } return mux; } void mp4_mux_destroy(struct mp4_mux *mux) { for (size_t i = 0; i < mux->tracks.num; i++) free_track(&mux->tracks.array[i]); free_track(mux->chapter_track); bfree(mux->chapter_track); da_free(mux->tracks); bfree(mux); } bool mp4_mux_submit_packet(struct mp4_mux *mux, struct encoder_packet *pkt) { struct mp4_track *track = NULL; struct encoder_packet parsed_packet; enum obs_encoder_type type = pkt->type; bool fragment_ready = mux->next_frag_pts > 0; for (size_t i = 0; i < mux->tracks.num; i++) { struct mp4_track *tmp = &mux->tracks.array[i]; fragment_ready = fragment_ready && tmp->last_pts_usec >= mux->next_frag_pts; if (tmp->encoder == pkt->encoder) track = tmp; } if (!track) { warn("Could not find track for packet of type %s with " "track id %zu!", type == OBS_ENCODER_VIDEO ? "video" : "audio", pkt->track_idx); return false; } /* If all tracks have caught up to the keyframe we want to fragment on, * flush the current fragment to disk. */ if (fragment_ready) mp4_flush_fragment(mux); if (type == OBS_ENCODER_AUDIO) { obs_encoder_packet_ref(&parsed_packet, pkt); } else { if (track->codec == CODEC_H264) obs_parse_avc_packet(&parsed_packet, pkt); else if (track->codec == CODEC_HEVC) obs_parse_hevc_packet(&parsed_packet, pkt); else if (track->codec == CODEC_AV1) obs_parse_av1_packet(&parsed_packet, pkt); else if (track->codec == CODEC_PRORES) obs_encoder_packet_ref(&parsed_packet, pkt); /* Set fragmentation PTS if packet is keyframe and PTS > 0 */ if (parsed_packet.keyframe && parsed_packet.pts > 0) { mux->next_frag_pts = packet_pts_usec(&parsed_packet); } } track_insert_packet(track, &parsed_packet); return true; } bool mp4_mux_add_chapter(struct mp4_mux *mux, int64_t dts_usec, const char *name) { if (dts_usec < 0) return false; if (!mux->chapter_track) add_chapter_track(mux); /* To work correctly there needs to be a chapter at PTS 0, * create that here if necessary. */ if (dts_usec > 0 && mux->chapter_track->packets.size == 0) { mp4_mux_add_chapter(mux, 0, obs_module_text("MP4Output.StartChapter")); } /* Create packets that will be muxed on final flush */ struct encoder_packet pkt; mp4_create_chapter_pkt(&pkt, dts_usec, name); track_insert_packet(mux->chapter_track, &pkt); return true; } bool mp4_mux_finalise(struct mp4_mux *mux) { struct serializer *s = mux->serializer; /* Flush remaining audio/video samples as final fragment. */ info("Flushing final fragment..."); /* Set target PTS to zero to indicate that we want to flush all * the remaining packets */ mux->next_frag_pts = 0; mp4_flush_fragment(mux); info("Number of fragments: %u", mux->fragments_written); if (mux->flags & MP4_SKIP_FINALISATION) { warn("Skipping finalization!"); return true; } int64_t data_end = serializer_get_pos(s); /* ---------------------------------------- */ /* Write full moov box */ /* Use array serializer for moov data as this will do a lot * of seeks to write size values of variable-size boxes. */ struct serializer fs; struct array_output_data ao; array_output_serializer_init(&fs, &ao); mux->serializer = &fs; mp4_write_moov(mux, false); s_write(s, ao.bytes.array, ao.bytes.num); info("Full moov size: %zu KiB", ao.bytes.num / 1024); mux->serializer = s; // restore real serializer array_output_serializer_free(&ao); /* ---------------------------------------- */ /* Overwrite file header (ftyp + free/moov) */ serializer_seek(s, 0, SERIALIZE_SEEK_START); mp4_write_ftyp(mux, false); size_t data_size = data_end - mux->placeholder_offset; serializer_seek(s, (int64_t)mux->placeholder_offset, SERIALIZE_SEEK_START); /* If data is more than 4 GiB the mdat header becomes 16 bytes, hence * why we create a 16-byte placeholder "free" box at the start. */ if (data_size > UINT32_MAX) { s_wb32(s, 1); // 1 = use "largesize" field instead s_write(s, "mdat", 4); s_wb64(s, data_size); // largesize (64-bit) } else { s_wb32(s, (uint32_t)data_size); s_write(s, "mdat", 4); } info("Final mdat size: %zu KiB", data_size / 1024); return true; }