577 lines
18 KiB
Raw Normal View History

2024-09-30 16:12:57 +00:00
#include "mov-internal.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
// stsd: Sample Description Box
int mp4_read_extra(struct mov_t* mov, const struct mov_box_t* box)
int r;
uint64_t p1, p2;
p1 = mov_buffer_tell(&mov->io);
r = mov_reader_box(mov, box);
p2 = mov_buffer_tell(&mov->io);
mov_buffer_skip(&mov->io, box->size - (p2 - p1));
return r;
/*
aligned(8) abstract class SampleEntry (unsigned int(32) format)
	extends Box(format){
	const unsigned int(8)[6] reserved = 0;
	unsigned int(16) data_reference_index;
}
*/
static int mov_read_sample_entry(struct mov_t* mov, struct mov_box_t* box, uint16_t* data_reference_index)
box->size = mov_buffer_r32(&mov->io);
box->type = mov_buffer_r32(&mov->io);
mov_buffer_skip(&mov->io, 6); // const unsigned int(8)[6] reserved = 0;
*data_reference_index = (uint16_t)mov_buffer_r16(&mov->io); // ref [dref]
return 0;
/*
class AudioSampleEntry(codingname) extends SampleEntry (codingname){
	const unsigned int(32)[2] reserved = 0;
	template unsigned int(16) channelcount = 2;
	template unsigned int(16) samplesize = 16;
	unsigned int(16) pre_defined = 0;
	const unsigned int(16) reserved = 0 ;
	template unsigned int(32) samplerate = { default samplerate of media}<<16;
}
*/
static int mov_read_audio(struct mov_t* mov, struct mov_sample_entry_t* entry)
uint16_t qtver;
struct mov_box_t box;
mov_read_sample_entry(mov, &box, &entry->data_reference_index);
entry->object_type_indication = mov_tag_to_object(box.type);
entry->stream_type = MP4_STREAM_AUDIO;
mov->track->tag = box.type;
#if 0
// const unsigned int(32)[2] reserved = 0;
mov_buffer_skip(&mov->io, 8);
qtver = mov_buffer_r16(&mov->io); /* version */
mov_buffer_r16(&mov->io); /* revision level */
mov_buffer_r32(&mov->io); /* vendor */
entry->u.audio.channelcount = (uint16_t)mov_buffer_r16(&mov->io);
entry->u.audio.samplesize = (uint16_t)mov_buffer_r16(&mov->io);
#if 0
// unsigned int(16) pre_defined = 0;
// const unsigned int(16) reserved = 0 ;
mov_buffer_skip(&mov->io, 4);
mov_buffer_r16(&mov->io); /* audio cid */
mov_buffer_r16(&mov->io); /* packet size = 0 */
entry->u.audio.samplerate = mov_buffer_r32(&mov->io); // { default samplerate of media}<<16;
// audio extra(avc1: ISO/IEC 14496-14:2003(E))
box.size -= 36;
// https://developer.apple.com/library/archive/documentation/QuickTime/QTFF/QTFFChap3/qtff3.html#//apple_ref/doc/uid/TP40000939-CH205-124774
if (1 == qtver && box.size >= 16)
// Sound Sample Description (Version 1)
mov_buffer_r32(&mov->io); // Samples per packet
mov_buffer_r32(&mov->io); // Bytes per packet
mov_buffer_r32(&mov->io); // Bytes per frame
mov_buffer_r32(&mov->io); // Bytes per sample
box.size -= 16;
else if (2 == qtver && box.size >= 36)
// Sound Sample Description (Version 2)
mov_buffer_r32(&mov->io); // sizeOfStructOnly
mov_buffer_r64(&mov->io); // audioSampleRate
mov_buffer_r32(&mov->io); // numAudioChannels
mov_buffer_r32(&mov->io); // always7F000000
mov_buffer_r32(&mov->io); // constBitsPerChannel
mov_buffer_r32(&mov->io); // formatSpecificFlags
mov_buffer_r32(&mov->io); // constBytesPerAudioPacket
mov_buffer_r32(&mov->io); // constLPCMFramesPerAudioPacket
box.size -= 36;
return mp4_read_extra(mov, &box);
/*
class VisualSampleEntry(codingname) extends SampleEntry (codingname){
	unsigned int(16) pre_defined = 0;
	const unsigned int(16) reserved = 0;
	unsigned int(32)[3] pre_defined = 0;
	unsigned int(16) width;
	unsigned int(16) height;
	template unsigned int(32) horizresolution = 0x00480000; // 72 dpi
	template unsigned int(32) vertresolution = 0x00480000; // 72 dpi
	const unsigned int(32) reserved = 0;
	template unsigned int(16) frame_count = 1;
	string[32] compressorname;
	template unsigned int(16) depth = 0x0018;
	int(16) pre_defined = -1;
	// other boxes from derived specifications
	CleanApertureBox clap; // optional
	PixelAspectRatioBox pasp; // optional
}
class AVCSampleEntry() extends VisualSampleEntry ('avc1'){
	AVCConfigurationBox config;
	MPEG4BitRateBox (); // optional
	MPEG4ExtensionDescriptorsBox (); // optional
}
class AVC2SampleEntry() extends VisualSampleEntry ('avc2'){
	AVCConfigurationBox avcconfig;
	MPEG4BitRateBox bitrate; // optional
	MPEG4ExtensionDescriptorsBox descr; // optional
	extra_boxes boxes; // optional
}
*/
static int mov_read_video(struct mov_t* mov, struct mov_sample_entry_t* entry)
struct mov_box_t box;
mov_read_sample_entry(mov, &box, &entry->data_reference_index);
entry->object_type_indication = mov_tag_to_object(box.type);
entry->stream_type = MP4_STREAM_VISUAL;
mov->track->tag = box.type;
#if 1
//unsigned int(16) pre_defined = 0;
//const unsigned int(16) reserved = 0;
//unsigned int(32)[3] pre_defined = 0;
mov_buffer_skip(&mov->io, 16);
mov_buffer_r16(&mov->io); /* version */
mov_buffer_r16(&mov->io); /* revision level */
mov_buffer_r32(&mov->io); /* vendor */
mov_buffer_r32(&mov->io); /* temporal quality */
mov_buffer_r32(&mov->io); /* spatial quality */
entry->u.visual.width = (uint16_t)mov_buffer_r16(&mov->io);
entry->u.visual.height = (uint16_t)mov_buffer_r16(&mov->io);
entry->u.visual.horizresolution = mov_buffer_r32(&mov->io); // 0x00480000 - 72 dpi
entry->u.visual.vertresolution = mov_buffer_r32(&mov->io); // 0x00480000 - 72 dpi
// const unsigned int(32) reserved = 0;
mov_buffer_r32(&mov->io); /* data size, always 0 */
entry->u.visual.frame_count = (uint16_t)mov_buffer_r16(&mov->io);
//string[32] compressorname;
//uint32_t len = mov_buffer_r8(&mov->io);
//mov_buffer_skip(&mov->io, len);
mov_buffer_skip(&mov->io, 32);
entry->u.visual.depth = (uint16_t)mov_buffer_r16(&mov->io);
// int(16) pre_defined = -1;
mov_buffer_skip(&mov->io, 2);
// video extra(avc1: ISO/IEC 14496-15:2010(E))
box.size -= 86;
return mp4_read_extra(mov, &box);
/*
class PixelAspectRatioBox extends Box(pasp){
	unsigned int(32) hSpacing;
	unsigned int(32) vSpacing;
}
*/
/**
 * Handler for the PixelAspectRatioBox ('pasp').
 *
 * As written, this handler reads nothing from the stream and stores nothing;
 * it only reports success.
 * NOTE(review): hSpacing/vSpacing are not consumed here -- presumably the
 * caller skips the unread payload; confirm.
 *
 * @param mov  reader context (unused)
 * @param box  box descriptor (unused)
 * @return always 0
 */
int mov_read_pasp(struct mov_t* mov, const struct mov_box_t* box)
{
    (void)mov;
    (void)box;
    return 0;
}
static int mov_read_hint_sample_entry(struct mov_t* mov, struct mov_sample_entry_t* entry)
struct mov_box_t box;
mov_read_sample_entry(mov, &box, &entry->data_reference_index);
mov_buffer_skip(&mov->io, box.size - 16);
entry->object_type_indication = mov_tag_to_object(box.type);
entry->stream_type = MP4_STREAM_VISUAL;
mov->track->tag = box.type;
return mov_buffer_error(&mov->io);
static int mov_read_meta_sample_entry(struct mov_t* mov, struct mov_sample_entry_t* entry)
struct mov_box_t box;
mov_read_sample_entry(mov, &box, &entry->data_reference_index);
mov_buffer_skip(&mov->io, box.size - 16);
entry->object_type_indication = mov_tag_to_object(box.type);
entry->stream_type = MP4_STREAM_VISUAL;
mov->track->tag = box.type;
return mov_buffer_error(&mov->io);
// ISO/IEC 14496-12:2015(E) 12.5 Text media (p184)
/*
class PlainTextSampleEntry(codingname) extends SampleEntry (codingname) {
}
class SimpleTextSampleEntry(codingname) extends PlainTextSampleEntry ('stxt') {
	string content_encoding; // optional
	string mime_format;
	BitRateBox (); // optional
	TextConfigBox (); // optional
}
*/
static int mov_read_text_sample_entry(struct mov_t* mov, struct mov_sample_entry_t* entry)
struct mov_box_t box;
mov_read_sample_entry(mov, &box, &entry->data_reference_index);
if (MOV_TEXT == box.type)
// https://developer.apple.com/documentation/quicktime-file-format/text_sample_description
//mov_buffer_r32(&mov->io); /* display flags */
//mov_buffer_r32(&mov->io); /* text justification */
//mov_buffer_r16(&mov->io); /* background color: 48-bit RGB color */
//mov_buffer_r64(&mov->io); /* default text box (top, left, bottom, right) */
//mov_buffer_r64(&mov->io); /* reserved */
//mov_buffer_r16(&mov->io); /* font number */
//mov_buffer_r16(&mov->io); /* font face */
//mov_buffer_r8(&mov->io); /* reserved */
//mov_buffer_r16(&mov->io); /* reserved */
//mov_buffer_r16(&mov->io); /* foreground color: 48-bit RGB color */
////mov_buffer_r16(&mov->io); /* text name */
mov_buffer_skip(&mov->io, box.size - 16);
mov_buffer_skip(&mov->io, box.size - 16);
entry->object_type_indication = mov_tag_to_object(box.type);
entry->stream_type = MP4_STREAM_VISUAL;
mov->track->tag = box.type;
return mov_buffer_error(&mov->io);
// ISO/IEC 14496-12:2015(E) 12.6 Subtitle media (p185)
/*
class SubtitleSampleEntry(codingname) extends SampleEntry (codingname) {
}
class XMLSubtitleSampleEntry() extends SubtitleSampleEntry('stpp') {
	string namespace;
	string schema_location; // optional
	string auxiliary_mime_types;
	// optional, required if auxiliary resources are present
	BitRateBox (); // optional
}
class TextSubtitleSampleEntry() extends SubtitleSampleEntry('sbtt') {
	string content_encoding; // optional
	string mime_format;
	BitRateBox (); // optional
	TextConfigBox (); // optional
}
class TextSampleEntry() extends SampleEntry('tx3g') {
	unsigned int(32) displayFlags;
	signed int(8) horizontal-justification;
	signed int(8) vertical-justification;
	unsigned int(8) background-color-rgba[4];
	BoxRecord default-text-box;
	StyleRecord default-style;
	FontTableBox font-table;
	DisparityBox default-disparity;
}
*/
static int mov_read_subtitle_sample_entry(struct mov_t* mov, struct mov_sample_entry_t* entry)
struct mov_box_t box;
mov_read_sample_entry(mov, &box, &entry->data_reference_index);
box.size -= 16;
if (box.type == MOV_TAG('t', 'x', '3', 'g'))
mov_read_tx3g(mov, &box);
mov_buffer_skip(&mov->io, box.size - 16);
entry->object_type_indication = MOV_OBJECT_TEXT;
entry->stream_type = MP4_STREAM_VISUAL;
mov->track->tag = box.type;
return mov_buffer_error(&mov->io);
int mov_read_stsd(struct mov_t* mov, const struct mov_box_t* box)
uint32_t i, entry_count;
struct mov_track_t* track = mov->track;
entry_count = mov_buffer_r32(&mov->io);
if (track->stsd.entry_count < entry_count)
void* p = realloc(track->stsd.entries, sizeof(track->stsd.entries[0]) * entry_count);
if (NULL == p) return -ENOMEM;
track->stsd.entries = (struct mov_sample_entry_t*)p;
track->stsd.entry_count = entry_count;
for (i = 0; i < entry_count; i++)
track->stsd.current = &track->stsd.entries[i];
memset(track->stsd.current, 0, sizeof(*track->stsd.current));
if (MOV_AUDIO == track->handler_type)
mov_read_audio(mov, &track->stsd.entries[i]);
else if (MOV_VIDEO == track->handler_type)
mov_read_video(mov, &track->stsd.entries[i]);
else if (MOV_HINT == track->handler_type)
mov_read_hint_sample_entry(mov, &track->stsd.entries[i]);
else if (MOV_META == track->handler_type)
mov_read_meta_sample_entry(mov, &track->stsd.entries[i]);
else if (MOV_CLCP == track->handler_type)
mov_read_meta_sample_entry(mov, &track->stsd.entries[i]);
else if (MOV_TEXT == track->handler_type)
mov_read_text_sample_entry(mov, &track->stsd.entries[i]);
else if (MOV_SUBT == track->handler_type || MOV_SBTL == track->handler_type)
mov_read_subtitle_sample_entry(mov, &track->stsd.entries[i]);
else if (MOV_ALIS == track->handler_type)
mov_read_meta_sample_entry(mov, &track->stsd.entries[i]);
assert(0); // ignore
mov_read_meta_sample_entry(mov, &track->stsd.entries[i]);
return mov_buffer_error(&mov->io);
//static int mov_write_h264(const struct mov_t* mov)
// size_t size;
// uint64_t offset;
// const struct mov_track_t* track = mov->track;
// size = 8 /* Box */;
// offset = mov_buffer_tell(&mov->io);
// mov_buffer_w32(&mov->io, 0); /* size */
// mov_buffer_w32(&mov->io, MOV_TAG('a', 'v', 'c', 'C'));
// mov_write_size(mov, offset, size); /* update size */
// return size;
static size_t mov_write_btrt(const struct mov_t* mov, const struct mov_sample_entry_t* entry)
mov_buffer_w32(&mov->io, 20); /* size */
mov_buffer_write(&mov->io, "btrt", 4);
mov_buffer_w32(&mov->io, entry->u.bitrate.bufferSizeDB);
mov_buffer_w32(&mov->io, 0x00000014);
mov_buffer_w32(&mov->io, 0x00000014);
//mov_buffer_w32(&mov->io, entry->u.bitrate.maxBitrate);
//mov_buffer_w32(&mov->io, entry->u.bitrate.avgBitrate);
return 20;
static size_t mov_write_video(const struct mov_t* mov, const struct mov_sample_entry_t* entry)
size_t size;
uint64_t offset;
char compressorname[32];
memset(compressorname, 0, sizeof(compressorname));
assert(1 == entry->data_reference_index);
size = 8 /* Box */ + 8 /* SampleEntry */ + 70 /* VisualSampleEntry */;
offset = mov_buffer_tell(&mov->io);
mov_buffer_w32(&mov->io, 0); /* size */
mov_buffer_w32(&mov->io, mov->track->tag); // "h264"
mov_buffer_w32(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, entry->data_reference_index); /* Data-reference index */
mov_buffer_w16(&mov->io, 0); /* Reserved / Codec stream version */
mov_buffer_w16(&mov->io, 0); /* Reserved / Codec stream revision (=0) */
mov_buffer_w32(&mov->io, 0); /* Reserved */
mov_buffer_w32(&mov->io, 0); /* Reserved */
mov_buffer_w32(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, entry->u.visual.width); /* Video width */
mov_buffer_w16(&mov->io, entry->u.visual.height); /* Video height */
mov_buffer_w32(&mov->io, 0x00480000); /* Horizontal resolution 72dpi */
mov_buffer_w32(&mov->io, 0x00480000); /* Vertical resolution 72dpi */
mov_buffer_w32(&mov->io, 0); /* reserved / Data size (= 0) */
mov_buffer_w16(&mov->io, 1); /* Frame count (= 1) */
// ISO 14496-15:2017 AVCC \012AVC Coding
// ISO 14496-15:2017 HVCC \013HEVC Coding
//mov_buffer_w8(&mov->io, 0 /*strlen(compressor_name)*/); /* compressorname */
mov_buffer_write(&mov->io, compressorname, 32); // fill empty
// ISO/IEC 14496-15:2017 4.5 Template field used (19)
// 0x18 - the video sequence is in color with no alpha
// 0x28 - the video sequence is in grayscale with no alpha
// 0x20 - the video sequence has alpha (gray or color)
mov_buffer_w16(&mov->io, 0x18); /* Reserved */
mov_buffer_w16(&mov->io, 0xffff); /* Reserved */
if(MOV_OBJECT_H264 == entry->object_type_indication)
size += mov_write_avcc(mov);
else if (MOV_OBJECT_H265 == entry->object_type_indication)
size += mov_write_hvcc(mov);
else if (MOV_OBJECT_H266 == entry->object_type_indication)
size += mov_write_vvcc(mov);
else if (MOV_OBJECT_MP4V == entry->object_type_indication || MOV_OBJECT_JPEG == entry->object_type_indication || MOV_OBJECT_PNG == entry->object_type_indication || MOV_OBJECT_JPEG2000 == entry->object_type_indication)
size += mov_write_esds(mov);
else if (MOV_OBJECT_AV1 == entry->object_type_indication)
size += mov_write_av1c(mov);
else if (MOV_OBJECT_VP8 == entry->object_type_indication || MOV_OBJECT_VP9 == entry->object_type_indication)
size += mov_write_vpcc(mov);
//size += mov_write_btrt(mov, entry);
mov_write_size(mov, offset, size); /* update size */
return size;
static size_t mov_write_audio(const struct mov_t* mov, const struct mov_sample_entry_t* entry)
size_t size;
uint64_t offset;
size = 8 /* Box */ + 8 /* SampleEntry */ + 20 /* AudioSampleEntry */;
offset = mov_buffer_tell(&mov->io);
mov_buffer_w32(&mov->io, 0); /* size */
mov_buffer_w32(&mov->io, mov->track->tag); // "mp4a"
mov_buffer_w32(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, 1); /* Data-reference index */
/* SoundDescription */
mov_buffer_w16(&mov->io, 0); /* Version */
mov_buffer_w16(&mov->io, 0); /* Revision level */
mov_buffer_w32(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, entry->u.audio.channelcount); /* channelcount */
mov_buffer_w16(&mov->io, entry->u.audio.samplesize); /* samplesize */
mov_buffer_w16(&mov->io, 0); /* pre_defined */
mov_buffer_w16(&mov->io, 0); /* reserved / packet size (= 0) */
// https://www.opus-codec.org/docs/opus_in_isobmff.html
// 4.3 Definitions of Opus sample
// OpusSampleEntry:
// 1. The samplesize field shall be set to 16.
// 2. The samplerate field shall be set to 48000<<16.
mov_buffer_w32(&mov->io, entry->u.audio.samplerate); /* samplerate */
if(MOV_OBJECT_AAC == entry->object_type_indication || MOV_OBJECT_MP3 == entry->object_type_indication || MOV_OBJECT_MP1A == entry->object_type_indication)
size += mov_write_esds(mov);
else if(MOV_OBJECT_OPUS == entry->object_type_indication)
size += mov_write_dops(mov);
//size += mov_write_btrt(mov, entry);
mov_write_size(mov, offset, size); /* update size */
return size;
static int mov_write_subtitle(const struct mov_t* mov, const struct mov_sample_entry_t* entry)
int size;
uint64_t offset;
size = 8 /* Box */ + 8 /* SampleEntry */;
offset = mov_buffer_tell(&mov->io);
mov_buffer_w32(&mov->io, 0); /* size */
mov_buffer_w32(&mov->io, mov->track->tag); // "tx3g"
mov_buffer_w32(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, 0); /* Reserved */
mov_buffer_w16(&mov->io, entry->data_reference_index); /* Data-reference index */
if (MOV_TAG('t', 'x', '3', 'g') == mov->track->tag)
size += mov_write_tx3g(mov);
else if (entry->extra_data_size > 0) // unknown type
mov_buffer_write(&mov->io, entry->extra_data, entry->extra_data_size);
size += entry->extra_data_size;
size += mov_write_btrt(mov, entry);
mov_write_size(mov, offset, size); /* update size */
return size;
size_t mov_write_stsd(const struct mov_t* mov)
uint32_t i;
size_t size;
uint64_t offset;
const struct mov_track_t* track = mov->track;
size = 12 /* full box */ + 4 /* entry count */;
offset = mov_buffer_tell(&mov->io);
mov_buffer_w32(&mov->io, 0); /* size */
mov_buffer_write(&mov->io, "stsd", 4);
mov_buffer_w32(&mov->io, 0); /* version & flags */
mov_buffer_w32(&mov->io, track->stsd.entry_count); /* entry count */
for (i = 0; i < track->stsd.entry_count; i++)
((struct mov_track_t*)track)->stsd.current = &track->stsd.entries[i];
if (MOV_VIDEO == track->handler_type)
size += mov_write_video(mov, &track->stsd.entries[i]);
else if (MOV_AUDIO == track->handler_type)
size += mov_write_audio(mov, &track->stsd.entries[i]);
else if (MOV_SUBT == track->handler_type || MOV_TEXT == track->handler_type || MOV_SBTL == track->handler_type)
size += mov_write_subtitle(mov, &track->stsd.entries[i]);
mov_write_size(mov, offset, size); /* update size */
return size;