Mp4文件解析

MP4可以说是当前最流行的视频格式,要播放一个MP4文件需要首先将其结构给解析出来。MP4的结构往简单了说就是类似于俄罗斯套娃一样的很多box套box,往复杂了说就是很多种类的box,而且还需要做一些解析和计算的操作,下面就按照其结构来分析一下MP4文件里的主要的box.左侧的目录可以清晰地展示出各种box之间的关系。需要注意的是在ISO标准中box的种类非常多,这里只是列举分析了一些比较重要的box.

Mp4Box

首先我们需要先了解一下box的结构。在Mp4中所有的box都包含了一个BoxHeader和BoxData。其中BoxHeader包含了box的类型和整个box的长度,BoxData里面包含了子box或者各种数据。基于这种结构,我们就可以设计出所有box的父类。还需要注意的是Media Data Box的长度有可能超出32位int的范围,在这种情况下,size就被设置为1,然后使用接下来的64位(largesize)来存储其长度;如果size为0,则表示该box一直延伸到文件末尾。如果box的type为"uuid"的话,那接下来的16byte就用来存储uuid。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
public class Mp4Box {
public int headSize = 8;
public int size;
public String type;
public long largeSize;
public boolean hasSubBox = true;
public int index;
public long end;

/**
 * Parses the header of the box that begins at {@code start}.
 *
 * <p>Reads the 32-bit size and the four-character type, then widens the header for the
 * two extended layouts handled here: a {@code uuid} type adds 16 bytes, and a size of 1
 * means the real length is stored in a following 64-bit field.
 *
 * @param byteBuffer buffer holding the MP4 data; assumed to contain the file from offset 0
 *                   up to its end (TODO confirm against the caller)
 * @param start      offset of the first byte of this box inside {@code byteBuffer}
 */
public Mp4Box(byte[] byteBuffer, int start) {
    this.index = start;
    size = getSize(byteBuffer);
    // Widen before adding so start + size cannot wrap around in 32-bit arithmetic.
    end = (long) start + size;
    type = getType(byteBuffer);
    if ("uuid".equals(type)) {
        headSize += 16;
    }

    if (size == 1) {
        largeSize = getLongFromBuffer(byteBuffer);
        end = start + largeSize;
        headSize += 8;
    } else if (size == 0) {
        // Box extends to the end of the file: without this, end would stay equal to
        // start and the box would look empty to anything that iterates up to end.
        end = byteBuffer.length;
    }
}

Fullbox

在上面我们定义了父类Mp4Box,一般情况下box头部的长度都是8byte,但是还有一种box在头部之后还有1byte的version和3byte的flags。所以我们需要创建一个FullBox父类,让所有这种类型的box来继承它。

1
2
3
4
5
6
7
8
9
10
11
/**
 * Base class for boxes whose header carries an extra 1-byte version and 3-byte flags
 * field after the regular box header.
 */
public class FullBox extends Mp4Box {
    /** Value of the 1-byte version field. */
    public int version;
    /** Value of the 3-byte flags field. */
    public int flag;

    /**
     * Parses the regular box header via the parent constructor, then the version and flags.
     *
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public FullBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        version = getIntFromBuffer(byteBuffer, 1);
        flag = getIntFromBuffer(byteBuffer, 3);
        // Add the 4 version/flags bytes on top of whatever the parent computed; assigning
        // a constant 12 would discard the extra bytes counted for uuid or 64-bit-size headers.
        headSize += 4;
    }
}

ftyp

代表了File type box. 一般在文件的开头处(只有固定大小的文件签名可以在其前面),主要包含了该文件brand等,其定义方式如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/**
 * File Type Box ({@code ftyp}): major brand, minor version and the list of compatible brands.
 *
 * <p>In ISO/IEC 14496-12 this box is a plain box without version/flags, so it extends
 * {@link Mp4Box} directly; parsing it as a full box would consume the first four payload
 * bytes (the major brand) as version/flags and shift every following field.
 */
public class FtypBox extends Mp4Box {
    public int majorBrand;
    public int minor_version;
    public int[] compatibleBrands;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public FtypBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        hasSubBox = false;
        majorBrand = getIntFromBuffer(byteBuffer, 4);
        minor_version = getIntFromBuffer(byteBuffer, 4);

        // Everything between the read cursor and the end of the box is a list of 4-byte
        // brands. Measure against end (an absolute offset), not size, so the count is
        // also right when the box does not start at offset 0.
        long remaining = end - index;
        int num = remaining > 0 ? (int) (remaining / 4) : 0;
        compatibleBrands = new int[num];
        for (int i = 0; i < num; i++) {
            compatibleBrands[i] = getIntFromBuffer(byteBuffer, 4);
        }
    }
}

moov

代表了Movie box,这个box包含了一个mvhd box 和多个trak box(如video,audio等),一般在文件的开头处仅次于ftyp,但是也有放在文件末尾的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/**
 * Movie Box ({@code moov}): container holding one {@code mvhd} box and any number of
 * {@code trak} boxes.
 */
public class MoovBox extends Mp4Box {

    /** The movie header child, or {@code null} if none was found. */
    public MvhdBox mvhdBox;
    /** Every {@code trak} child, in file order. */
    public ArrayList<TrakBox> trakBox = new ArrayList<>();

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public MoovBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        parseSub(byteBuffer);
    }

    /**
     * Walks the direct children of this box and parses the {@code mvhd} and {@code trak}
     * ones; other child types are skipped by their declared size.
     */
    @Override
    public void parseSub(byte[] byteBuffer) {
        int subStart = index;
        do {
            int size = getSize(byteBuffer);
            String type = getType(byteBuffer);
            if (size < 8) {
                // 0/1 are special size markers and anything else below 8 is corrupt;
                // either way the next child cannot be located.
                break;
            }
            // A child of exactly 8 bytes is a valid header-only box (e.g. an empty
            // "free" box): it has no payload to parse but must still be stepped over,
            // otherwise every box after it would be lost.
            if (size > 8) {
                if (type.equals("mvhd")) {
                    mvhdBox = new MvhdBox(byteBuffer, subStart);
                } else if (type.equals("trak")) {
                    trakBox.add(new TrakBox(byteBuffer, subStart));
                }
            }
            subStart += size;
            index = subStart;
        } while (subStart < end);
    }
}

mvhd

代表了Movie Header box。包含了整个媒体文件的信息。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/**
 * Movie Header Box ({@code mvhd}): file-wide timing and presentation information.
 */
public class MvhdBox extends FullBox {

    /** Creation time; the accompanying article describes it as seconds since 1904-01-01. */
    public long creationTime;
    /** Modification time, same unit as {@link #creationTime}. */
    public long modificationTime;
    /** Number of time units per second. */
    public int timescale;
    /** Duration of the presentation, in {@link #timescale} units. */
    public long duration;

    public int rate;
    public int volume;
    public int reserved = 0;
    public int[] matrix = new int[9];
    public int[] preDefined = new int[6];
    public int nextTrackId;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public MvhdBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        if (version == 1) {
            // Version 1 stores the times and the duration as 64-bit values.
            creationTime = getLongFromBuffer(byteBuffer);
            modificationTime = getLongFromBuffer(byteBuffer);
            timescale = getIntFromBuffer(byteBuffer, 4);
            duration = getLongFromBuffer(byteBuffer);
        } else {
            // Version 0 stores them as unsigned 32-bit values. Mask after widening so a
            // value with the top bit set (any 1904-based timestamp later than 1972) does
            // not turn negative through sign extension.
            creationTime = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
            modificationTime = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
            timescale = getIntFromBuffer(byteBuffer, 4);
            duration = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
        }
        rate = getIntFromBuffer(byteBuffer, 4);
        volume = getIntFromBuffer(byteBuffer, 2);
        // skip reserved
        index += 10;

        for (int i = 0; i < matrix.length; i++) {
            matrix[i] = getIntFromBuffer(byteBuffer, 4);
        }
        for (int i = 0; i < preDefined.length; i++) {
            preDefined[i] = getIntFromBuffer(byteBuffer, 4);
        }
        nextTrackId = getIntFromBuffer(byteBuffer, 4);
    }
}
  • creationTime: 一个整型数据(version 0 时为32位,version 1 时为64位),代表了文件的创建时间(从1904年1月1日凌晨开始的秒数)
  • modificationTime:修改时间,定义同上
  • timescale:1秒内包含的时间单位,和下面的duration结合可以得到媒体的时长。

    • duration:媒体的时长,以时长最长的track为准

    • rate:播放的速度,1.0为正常速度

    • volume:播放的音量
    • matrix:视频的转换矩阵
      • nextTrackId:下一个track的id

        trak

        代表了 Track box。包含了一条track。通常情况下一个视频会包含video和audio两条track,还有一条hint track是为streaming准备的。
        一个trak box内包含了tkhd和mdia两种box。
        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        /**
         * Track Box ({@code trak}): container for a single track, holding a {@code tkhd}
         * and a {@code mdia} child.
         */
        public class TrakBox extends Mp4Box {
            /** The track header child, or {@code null} if none was found. */
            public TkhdBox mTkhdBox;
            /** The media child, or {@code null} if none was found. */
            public MdiaBox mMdiaBox;

            /**
             * @param byteBuffer buffer holding the MP4 data
             * @param start      offset of the first byte of this box inside {@code byteBuffer}
             */
            public TrakBox(byte[] byteBuffer, int start) {
                super(byteBuffer, start);
                parseSub(byteBuffer);
            }

            /**
             * Walks the direct children of this box and parses the {@code tkhd} and
             * {@code mdia} ones; other child types are skipped by their declared size.
             */
            @Override
            public void parseSub(byte[] byteBuffer) {
                int subStart = index;
                do {
                    int size = getSize(byteBuffer);
                    String type = getType(byteBuffer);
                    if (size < 8) {
                        // 0/1 are special size markers and anything else below 8 is
                        // corrupt; either way the next child cannot be located.
                        break;
                    }
                    // A header-only child (exactly 8 bytes) has nothing to parse but must
                    // still be stepped over so the boxes after it are not lost.
                    if (size > 8) {
                        if (type.equals("tkhd")) {
                            mTkhdBox = new TkhdBox(byteBuffer, subStart);
                        } else if (type.equals("mdia")) {
                            mMdiaBox = new MdiaBox(byteBuffer, subStart);
                        }
                    }
                    subStart += size;
                    index = subStart;
                } while (subStart < end);
            }
        }

tkhd

代表了Track Header Box。每个trak box会包含一个tkhd box来存储这条track的信息。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/**
 * Track Header Box ({@code tkhd}): per-track timing, layering and presentation size.
 */
public class TkhdBox extends FullBox {
    public long creationTime;
    public long modificationTime;
    public int trackId;
    public long duration;

    public int layer;
    public int alternateGroup;
    public int volume;
    public int[] matrix = new int[9];
    /** Raw 32-bit width field as stored in the box. */
    public int width;
    /** Raw 32-bit height field as stored in the box. */
    public int height;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public TkhdBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        if (version == 1) {
            // Version 1 stores the times and the duration as 64-bit values.
            creationTime = getLongFromBuffer(byteBuffer);
            modificationTime = getLongFromBuffer(byteBuffer);
            trackId = getIntFromBuffer(byteBuffer, 4);
            // skip the 4 reserved bytes between track id and duration
            index += 4;
            duration = getLongFromBuffer(byteBuffer);
        } else {
            // Version 0 stores them as unsigned 32-bit values; mask after widening so
            // values with the top bit set do not become negative via sign extension.
            creationTime = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
            modificationTime = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
            trackId = getIntFromBuffer(byteBuffer, 4);
            // skip the 4 reserved bytes between track id and duration
            index += 4;
            duration = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
        }
        //skip reserved
        index += 8;

        layer = getIntFromBuffer(byteBuffer, 2);
        alternateGroup = getIntFromBuffer(byteBuffer, 2);
        volume = getIntFromBuffer(byteBuffer, 2);

        // skip the 2 reserved bytes that follow volume
        index += 2;
        for (int i = 0; i < matrix.length; i++) {
            matrix[i] = getIntFromBuffer(byteBuffer, 4);
        }
        width = getIntFromBuffer(byteBuffer, 4);
        height = getIntFromBuffer(byteBuffer, 4);
        Logger.i(toString());
    }
}
`

  • creationTime: 这条track的创建时间(同mvhd)
  • modificationTime: 这条track的更改时间(同mvhd)
  • trackId: 当前track的id,不能为0
  • duration: 当前track的时长
  • layer: 播放时video track的前后顺序,数字越小的越在上层。
  • alternateGroup: 对track进行分组,同一个组内同时只能播放一个track
  • volume: 音量大小,对于video track 会是0
  • matrix:video的转换矩阵
  • width/height: 视频的宽度和高度,以像素为单位(在文件中以16.16定点数存储)

mdia

代表了Media Box, 用来包含当前track的信息。mdia box主要有三个子box,分别是hdlr、mdhd和minf,我们会在接下来分析这几个子box。
其中componentName并不是mdia box需要解析的内容,在这里我们用来存储当前track的类型,如是video还是audio。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/**
 * Media Box ({@code mdia}): container for the {@code hdlr}, {@code mdhd} and {@code minf}
 * children of a track.
 */
public class MdiaBox extends Mp4Box {
    public HdlrBox mHdlrBox;
    public MdhdBox mMdhdBox;
    public MinfBox mMinfBox;
    /** Copy of the handler's component name; handed to {@code minf} as the track type. */
    public String componentName;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public MdiaBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        parseSub(byteBuffer);
    }

    /**
     * Walks the direct children of this box. {@code hdlr} and {@code mdhd} are parsed as
     * they are met; {@code minf} is parsed last because it needs the handler's name.
     */
    @Override
    public void parseSub(byte[] byteBuffer) {
        int subStart = index;
        // Offset of the (last) minf child, or -1 if there is none. Its parsing is
        // deferred so that componentName is set even when minf precedes hdlr in the file.
        int minfStart = -1;
        do {
            int size = getSize(byteBuffer);
            String type = getType(byteBuffer);
            if (size < 8) {
                // 0/1 are special size markers and anything else below 8 is corrupt;
                // either way the next child cannot be located.
                break;
            }
            // A header-only child (exactly 8 bytes) has nothing to parse but must still
            // be stepped over so the boxes after it are not lost.
            if (size > 8) {
                if (type.equals("hdlr")) {
                    mHdlrBox = new HdlrBox(byteBuffer, subStart);
                    componentName = mHdlrBox.componentName;
                } else if (type.equals("mdhd")) {
                    mMdhdBox = new MdhdBox(byteBuffer, subStart);
                } else if (type.equals("minf")) {
                    minfStart = subStart;
                }
            }
            subStart += size;
            index = subStart;
        } while (subStart < end);

        if (minfStart >= 0) {
            mMinfBox = new MinfBox(byteBuffer, minfStart, componentName);
        }
    }
}

mdhd

代表了Media Header Box。包含了一些当前track的信息,如creationTime等,在上文已经提过了,这里将不再赘述。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/**
 * Media Header Box ({@code mdhd}): timing information and language code of one track's media.
 */
public class MdhdBox extends FullBox {
    public long creationTime;
    public long modificationTime;
    /** Number of time units per second for this media. */
    public int timescale;
    /** Duration of this media, in {@link #timescale} units. */
    public long duration;
    /** Raw 16-bit language field. */
    public int language;
    public int predefined;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public MdhdBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        if (version == 1) {
            // Version 1 stores the times and the duration as 64-bit values.
            creationTime = getLongFromBuffer(byteBuffer);
            modificationTime = getLongFromBuffer(byteBuffer);
            timescale = getIntFromBuffer(byteBuffer, 4);
            duration = getLongFromBuffer(byteBuffer);
        } else {
            // Version 0 stores them as unsigned 32-bit values; mask after widening so
            // values with the top bit set do not become negative via sign extension.
            creationTime = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
            modificationTime = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
            timescale = getIntFromBuffer(byteBuffer, 4);
            duration = getIntFromBuffer(byteBuffer, 4) & 0xFFFFFFFFL;
        }
        language = getIntFromBuffer(byteBuffer, 2);
        predefined = getIntFromBuffer(byteBuffer, 2);
    }
}
    • language 当前媒体的语言代码

hdlr

代表了Handler Reference Box。包含了当前track类型的信息,如这条track是video、sound还是hint。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/**
 * Handler Reference Box ({@code hdlr}): identifies what kind of track this is.
 */
public class HdlrBox extends FullBox {
    public int componentType;
    public int componentSubType;
    /** Up to the first 12 bytes of the handler name. */
    public String componentName;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public HdlrBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);

        componentType = getIntFromBuffer(byteBuffer, 4);
        componentSubType = getIntFromBuffer(byteBuffer, 4);
        // skip the 12 bytes between the sub type and the name
        index += 12;
        // The name occupies the rest of the box and may be shorter than 12 bytes (some
        // files store an empty name). Clamp the read to what is left inside this box so
        // bytes of the following box never leak into the name.
        // NOTE(review): assumes getStringFromBuffer(buf, n) reads n bytes at index - confirm.
        long remaining = end - index;
        int nameLength = (int) Math.max(0L, Math.min(12L, remaining));
        componentName = getStringFromBuffer(byteBuffer, nameLength);
    }
}

minf

代表了Media Information Box。 内部主要包含了一个stbl box。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/**
 * Media Information Box ({@code minf}): container whose {@code stbl} child is parsed here.
 */
public class MinfBox extends Mp4Box {
    /** The sample table child, or {@code null} if none was found. */
    StblBox mStblBox;
    /** Track type string received from the caller and forwarded to the sample table. */
    private String mType;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     * @param type       track type string forwarded unchanged to {@code StblBox}
     */
    public MinfBox(byte[] byteBuffer, int start, String type) {
        super(byteBuffer, start);
        mType = type;
        parseSub(byteBuffer);
    }

    /**
     * Walks the direct children of this box and parses the {@code stbl} one; other child
     * types are skipped by their declared size.
     */
    @Override
    public void parseSub(byte[] byteBuffer) {
        int subStart = index;
        do {
            int size = getSize(byteBuffer);
            String type = getType(byteBuffer);
            if (size < 8) {
                // 0/1 are special size markers and anything else below 8 is corrupt;
                // either way the next child cannot be located.
                break;
            }
            // A header-only child (exactly 8 bytes) has nothing to parse but must still
            // be stepped over so the boxes after it are not lost.
            if (size > 8 && type.equals("stbl")) {
                mStblBox = new StblBox(byteBuffer, subStart, mType);
            }
            subStart += size;
            index = subStart;
        } while (subStart < end);
    }
}

stbl

代表了Sample Table Box。 这个box包含了当前track中所有Samples的时间和数据索引,所以根据这些信息就可以定位某个时间点的Sample及其大小等信息。
这个box包含了很多子box,将各个子box的信息结合起来就可以得到Samples的详细信息了。具体怎么计算将在最后做一下分析。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/**
 * Sample Table Box ({@code stbl}): container for the per-sample index tables of a track.
 */
public class StblBox extends Mp4Box {
    /** Track type string received from the caller and forwarded to {@code stsd}. */
    private final String mType;
    public StsdBox mStsdBox;
    // NOTE(review): no branch in parseSub fills this field, so it is always null here.
    public StssBox mStssBox;
    public SttsBox mSttsBox;
    public StszBox mStszBox;
    public StscBox mStscBox;
    public StcoBox mStcoBox;
    public Co64Box mCo64Box;
    public CttsBox mCttsBox;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     * @param type       track type string forwarded unchanged to {@code StsdBox}
     */
    public StblBox(byte[] byteBuffer, int start, String type) {
        super(byteBuffer, start);
        mType = type;
        parseSub(byteBuffer);
    }

    /**
     * Walks the direct children of this box and parses each recognised table; unknown
     * child types are skipped by their declared size.
     */
    @Override
    public void parseSub(byte[] byteBuffer) {
        int subStart = index;
        do {
            int size = getSize(byteBuffer);
            String type = getType(byteBuffer);
            if (size < 8) {
                // A child smaller than its own header (including the 0/1 markers) would
                // keep subStart from advancing past it, and a size of 0 would make this
                // loop spin forever. Stop instead.
                break;
            }
            if (type.equals("stsd")) {
                mStsdBox = new StsdBox(byteBuffer, subStart, mType);
            } else if (type.equals("stts")) {
                mSttsBox = new SttsBox(byteBuffer, subStart);
            } else if (type.equals("stsz")) {
                mStszBox = new StszBox(byteBuffer, subStart);
            } else if (type.equals("stsc")) {
                mStscBox = new StscBox(byteBuffer, subStart);
            } else if (type.equals("stco")) {
                mStcoBox = new StcoBox(byteBuffer, subStart);
            } else if (type.equals("co64")) {
                // The four-character code is all lower case; "Co64" can never match.
                mCo64Box = new Co64Box(byteBuffer, subStart);
            } else if (type.equals("ctts")) {
                mCttsBox = new CttsBox(byteBuffer, subStart);
            }
            subStart += size;
            index = subStart;
        } while (subStart < end);
    }
}

stts

代表了Decoding Time to Sample Box。这个box包含了一个表,从这个表里可以根据解码时间来定位sample的序号。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
 * Decoding Time to Sample Box ({@code stts}): a table of (sample count, sample delta) pairs.
 */
public class SttsBox extends FullBox {
    /** Number of rows announced by the box. */
    private final int entryCount;
    /** Sample count of each row. */
    public int[] sampleCount;
    /** Sample delta of each row. */
    public int[] sampleDelta;

    /**
     * Reads the row count followed by that many pairs of 4-byte values.
     *
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public SttsBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        final int rows = getIntFromBuffer(byteBuffer, 4);
        entryCount = rows;
        final int[] counts = new int[rows];
        final int[] deltas = new int[rows];
        int row = 0;
        while (row < rows) {
            // Each row is stored as count first, delta second.
            counts[row] = getIntFromBuffer(byteBuffer, 4);
            deltas[row] = getIntFromBuffer(byteBuffer, 4);
            row++;
        }
        sampleCount = counts;
        sampleDelta = deltas;
    }
}
  • sampleCount: 一个时间段内连续的Sample个数
  • sampleDelta:一个时间段内sample的delta。
    ctts
    代表了Composition Time to Sample Box。包含了decoding time 和composition time之间的偏移量。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
 * {@code ctts} box: a table of (sample count, sample offset) pairs.
 */
public class CttsBox extends FullBox {
    /** Number of rows announced by the box. */
    private final int mEntryCount;
    /** Sample count of each row. */
    public int[] sampleCount;
    /** Sample offset of each row. */
    public int[] sampleOffset;

    /**
     * Reads the row count followed by that many pairs of 4-byte values.
     *
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     */
    public CttsBox(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        final int rows = getIntFromBuffer(byteBuffer, 4);
        mEntryCount = rows;
        final int[] counts = new int[rows];
        final int[] offsets = new int[rows];
        int row = 0;
        while (row < rows) {
            // Each row is stored as count first, offset second.
            counts[row] = getIntFromBuffer(byteBuffer, 4);
            offsets[row] = getIntFromBuffer(byteBuffer, 4);
            row++;
        }
        sampleCount = counts;
        sampleOffset = offsets;
    }
}
  • sampleCount: 特定偏移量内连续的Sample个数
  • sampleOffset: CT和DT之间的偏移量,如CT(n)=DT(n)+CTTS(n)
stsd

代表了Sample Description Box,里面包含了一些子box,用来存储编码的信息。根据当前track的类型来决定子box的类型。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * Sample Description Box ({@code stsd}): holds one sample entry per described coding format.
 */
public class StsdBox extends FullBox {
    /** Track type string received from the caller; decides which entry class is built. */
    private final String mType;
    public int numberOfEntries;
    public SampleEntry[] entrys;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this box inside {@code byteBuffer}
     * @param type       track type string; a value starting with {@code "Video"} selects
     *                   {@code VisualSampleEntry}, anything else (including {@code null})
     *                   selects {@code AudioSampleEntry}
     */
    public StsdBox(byte[] byteBuffer, int start, String type) {
        super(byteBuffer, start);
        numberOfEntries = getIntFromBuffer(byteBuffer, 4);
        entrys = new SampleEntry[numberOfEntries];
        mType = type;
        parseSub(byteBuffer);
    }

    /**
     * Parses {@link #numberOfEntries} consecutive sample entries. Slots after a malformed
     * entry (size below 8) are left {@code null}.
     */
    @Override
    public void parseSub(byte[] byteBuffer) {
        int subStart = index;
        // Null-safe: the type is missing when no handler name was available upstream.
        boolean isVideo = mType != null && mType.startsWith("Video");
        for (int i = 0; i < numberOfEntries; i++) {
            int size = getSize(byteBuffer);
            String type = getType(byteBuffer);
            Logger.i("type:" + type);
            entrys[i] = isVideo
                    ? new VisualSampleEntry(byteBuffer, subStart)
                    : new AudioSampleEntry(byteBuffer, subStart);
            if (size < 8) {
                // The next entry cannot be located from a size smaller than a header.
                break;
            }
            subStart += size;
            // Move the read cursor to the next entry; without this the size/type of
            // every entry after the first would be read from inside the first entry.
            index = subStart;
        }
    }
}

VisualSampleEntry

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
 * Sample entry describing a video coding format: frame size, resolution, frame count
 * and compressor name.
 */
public class VisualSampleEntry extends SampleEntry {
    private final int mWidth;
    private final int mHeight;
    private final int mHorizeresolution;
    private final int mVertresolution;
    private final int mFrameCount;
    private final String mCompressorName;

    /**
     * @param byteBuffer buffer holding the MP4 data
     * @param start      offset of the first byte of this entry inside {@code byteBuffer}
     */
    public VisualSampleEntry(byte[] byteBuffer, int start) {
        super(byteBuffer, start);
        // skip the 16 bytes that precede width
        index += 16;
        mWidth = getIntFromBuffer(byteBuffer, 2);
        mHeight = getIntFromBuffer(byteBuffer, 2);
        mHorizeresolution = getIntFromBuffer(byteBuffer, 4);
        mVertresolution = getIntFromBuffer(byteBuffer, 4);
        // skip the 4 bytes between the resolutions and the frame count
        index += 4;
        mFrameCount = getIntFromBuffer(byteBuffer, 2);

        // Per ISO/IEC 14496-12 the compressor name is a fixed 32-byte field: one length
        // byte followed by up to 31 name bytes and padding. Reading a flat 4 bytes would
        // return the length byte plus only the first three characters.
        // NOTE(review): assumes getStringFromBuffer(buf, n) reads n bytes at index - confirm.
        int nameFieldStart = index;
        int nameLength = Math.min(getIntFromBuffer(byteBuffer, 1) & 0xFF, 31);
        mCompressorName = getStringFromBuffer(byteBuffer, nameLength);
        // Leave the cursor right after the fixed-width field, whatever the name length was.
        index = nameFieldStart + 32;
        Logger.i(toString());
    }
}

//todo