音视频开发之将aac转化为pcm学习笔记

音频基础知识主要参考文章

关键概念

PCM理解

PCM（Pulse Code Modulation）：又称脉冲编码调制。人耳听到的是模拟信号，PCM 是把声音从模拟信号转化为数字信号的技术。其原理是以固定的频率对模拟信号进行采样，采样后的信号在波形上看就像一串连续的、幅值不一的脉冲（短暂起伏的电冲击）；再把这些脉冲的幅值按一定的精度进行量化，量化后的数值被连续地输出、传输、处理或记录到存储介质中。以上这些步骤组成了数字音频的产生过程（抽样、量化、编码三个过程）。
描述PCM数据的6个参数：

Sample Rate : 采样频率。8kHz(电话)、44.1kHz(CD)、48kHz(DVD)。
Sample Size : 量化位数。通常该值为16-bit。
Number of Channels : 通道个数。常见的音频有立体声(stereo)和单声道(mono)两种类型，立体声包含左声道和右声道。另外还有环绕立体声等其它不太常用的类型。
Sign : 表示样本数据是否是有符号位，比如用一字节表示的样本数据，有符号的话表示范围为-128 ~ 127，无符号是0 ~ 255。
Byte Ordering : 字节序。字节序是little-endian还是big-endian。通常均为little-endian。字节序说明见第4节。
Integer Or Floating Point : 整型或浮点型。大多数格式的PCM样本数据使用整型表示，而在一些对精度要求高的应用方面，使用浮点类型表示PCM样本数据。

音频帧

音频和视频不一样：视频的每一帧都是一张图像，而音频数据是流式的，不同的编码格式有各自不同的编码标准。拿PCM和MP3做一个对比：PCM因为没有压缩，根据采样率、位宽等参数就可以算出每秒的音频数据量，并不需要帧的概念；MP3因为压缩后附带的信息比较多，所以有了类似H264的帧概念，每一个帧都有帧头。

G711

g711是由国际电信联盟（ITU-T）制定的一套语音压缩标准，主要用于电话语音通信。电话语音的最高频率一般约为3.4kHz，根据奈奎斯特采样定理，采样频率只需不低于信号最高频率的两倍，所以只要以8kHz的采样频率对人声进行采样，就可以还原该频带内的原始声音。
g711的内容是将一个13bit或14bit的样本编码成一个8bit的样本。
g711标准主要分两种压缩方法：a-law和mu-law。

a-law：将一个13bit的pcm样本压缩成一个8bit的pcm样本。
mu-law：将一个14bit的pcm样本压缩成一个8bit的pcm样本

注：

音频解码一定要注意设置采样率，采样个数，采样格式，声道数等参数，否则在播放和编码解码时很容易出现错误

代码实现

#include <stdint.h>
#include <stdio.h>
using namespace std;

#define MAX_AUDIO_FRAME_SIZE  192000

// The FFmpeg headers are plain C declarations, so when this file is built as
// C++ every one of them (libswresample included) is pulled in inside an
// extern "C" region to keep the linker names unmangled.
#ifdef _WIN32
//Windows
extern "C"
{
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavutil/imgutils.h>
#include <libswresample/swresample.h>
};
#else
//Linux...
#ifdef __cplusplus
extern "C"
{
#endif
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavutil/imgutils.h>
#include <libswresample/swresample.h>
#ifdef __cplusplus
};
#endif
#endif

// Path of the input file; decode() passes it to avformat_open_input() and
// av_dump_format().
char filepath[] = "test.aac";
// Output stream that decode() writes the converted PCM bytes to and closes
// when it finishes. It is opened here, during static initialization, and the
// fopen() result is not checked at this point (it is NULL if the open failed).
FILE* fp_pcm = fopen("test.pcm", "wb+");

int decode()
{
    
    
	AVFormatContext* formatCtx;//需要avformat_alloc_context();初始化
	int				i, stream_index;
	AVCodecContext* pCodecCtx = NULL;
	AVCodec* pCodec;

	AVFrame* pFrame;//需要av_frame_alloc();初始化
	AVPacket* packet;
	int ret, got_frame;

	avformat_network_init();//加载socket库以及网络加密协议相关的库，为后续使用网络相关提供支持 
	formatCtx = avformat_alloc_context();

	if (avformat_open_input(&formatCtx, filepath, NULL, NULL) != 0) {
    
    
		printf("Couldn't open input stream.\n");
		return -1;
	}

	if (avformat_find_stream_info(formatCtx, NULL) < 0) {
    
    
		//创建视频流,使用复用器解码得到码流，初始化AVStream
		printf("Couldn't find stream information.\n");
		return -1;
	}

	stream_index = -1;
	for (i = 0; i < formatCtx->nb_streams; i++)//nb_streams是输入视频的AVStream 个数
		if (formatCtx->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO) {
    
    //切换到视频流
			stream_index = i;
			break;
		}

	if (stream_index == -1) {
    
    
		printf("Didn't find a video stream.\n");
		return -1;//如果没有找到视频流就退出
	}

	//将AVFormatContext指定的解码器让avcodec_find_decoder()去查找
	pCodecCtx = formatCtx->streams[stream_index]->codec;

	pCodec = avcodec_find_decoder(pCodecCtx->codec_id);//查找解码器id
	if (pCodec == NULL) {
    
    
		printf("Codec not found.\n");
		return -1;
	}
	if (avcodec_open2(pCodecCtx, pCodec, NULL) < 0) {
    
    
		printf("Could not open codec.\n");
		return -1;
	}

	//Output Info-----------------------------
	printf("--------------- File Information ----------------\n");
	av_dump_format(formatCtx, 0, filepath, 0);// 打印关于输入或输出格式的详细信息，例如持续时间，比特率，流，容器，程序，元数据，边数据，编解码器和时基。
	printf("-------------------------------------------------\n");

	pFrame = av_frame_alloc();//注册AvFrame
	packet = (AVPacket*)av_malloc(sizeof(AVPacket));//分配一个AVPacket包的内存
	av_init_packet(packet);

	printf("sample_rate = %d , channels = %d bit_rate = %d\n", pCodecCtx->sample_rate, pCodecCtx->channels, pCodecCtx->bit_rate);
	// ffmpeg - f s16le - ar 44100 - ac 6 - i output.pcm - c:a aac - b : a 375k 123.aac

	//设置转码后输出相关参数
	//采样的布局方式
	uint64_t out_channel_layout = AV_CH_LAYOUT_STEREO;
	//采样个数
	int out_nb_samples = 1024;
	//采样格式
	enum AVSampleFormat  sample_fmt = AV_SAMPLE_FMT_S16;
	//采样率
	int out_sample_rate = 44100;
	//通道数
	int out_channels = av_get_channel_layout_nb_channels(out_channel_layout);
	printf("%d\n", out_channels);
	//创建buffer
	int buffer_size = av_samples_get_buffer_size(NULL, out_channels, out_nb_samples, sample_fmt, 1);

	//注意要用av_malloc
	uint8_t* buffer = (uint8_t*)av_malloc(MAX_AUDIO_FRAME_SIZE * 2);

	int64_t in_channel_layout = av_get_default_channel_layout(pCodecCtx->channels);
	//打开转码器
	struct SwrContext* convert_ctx = swr_alloc();
	//设置转码参数
	convert_ctx = swr_alloc_set_opts(convert_ctx, out_channel_layout, sample_fmt, out_sample_rate, \
		in_channel_layout, pCodecCtx->sample_fmt, pCodecCtx->sample_rate, 0, NULL);
	//初始化转码器
	swr_init(convert_ctx);


	while (av_read_frame(formatCtx, packet) >= 0) {
    
    //拆包

		if (packet->stream_index == stream_index) {
    
    

			ret = avcodec_decode_audio4(pCodecCtx, pFrame, &got_frame, packet);

			if (ret < 0) {
    
    
				printf("Decode Error.\n");
				return -1;
			}
			if (got_frame) {
    
    
				// 写入PCM音频数据到文件
				swr_convert(convert_ctx, &buffer, MAX_AUDIO_FRAME_SIZE, (const uint8_t**)pFrame->data, pFrame->nb_samples);
				printf("pts:%10lld\t packet size:%d\n", packet->pts, packet->size);
				fwrite(buffer, 1, buffer_size, fp_pcm);

			}
		}
		av_free_packet(packet);
	}

	swr_free(&convert_ctx);
	fflush(fp_pcm);
	fclose(fp_pcm);
	fp_pcm = nullptr;

	av_frame_free(&pFrame);
	avcodec_close(pCodecCtx);
	avformat_close_input(&formatCtx);

	return 0;
}

/**
 * Program entry point: runs decode() and reports its outcome to the caller.
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return 0 when decode() succeeds, 1 otherwise.
 */
int main(int argc, char* argv[])
{
	(void)argc;
	(void)argv;

	// Propagate a decode failure as a non-zero exit status instead of
	// discarding it.
	return decode() == 0 ? 0 : 1;
}