ffmpeg 实现aac转opus

AacToOpus.h

//
// Created by hhy on 2020/11/23.
//

#ifndef FFMPEGTEST_AACTOOPUS_H
#define FFMPEGTEST_AACTOOPUS_H

#include <string>
#include <iostream>
#include <chrono>

#ifdef __cplusplus
extern "C" {
#endif
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/frame.h>
#include <libavutil/mem.h>
#include <libavutil/opt.h>
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
#include <libavutil/frame.h>
#include <libavutil/mem.h>
#include <libswscale/swscale.h>

#ifdef __cplusplus
}
#endif

// Wraps a libavcodec decoder: compressed audio packets go in, raw PCM bytes come out.
// initialize() opens the "aac" decoder, so AAC is the only supported input.
class AudioDecoder
{
private:
    // Frame that receives decoder output; allocated by initialize().
    AVFrame* frame_ = nullptr;
    // Packet used to hand the caller's data to the decoder; allocated by initialize().
    AVPacket* packet_ = nullptr;
    // Decoder context; allocated and opened by initialize().
    AVCodecContext* codec_ctx_ = nullptr;
    // Not referenced by any method in this file; zeroed so it never holds an
    // indeterminate value.
    int codec_id_ = 0;
public:
    //Only support "aac"
    AudioDecoder();
    virtual ~AudioDecoder();
    // Allocates and opens the decoder. Returns 0 on success, -1 on failure.
    int initialize();
    // Decodes pkt into buf. On entry `size` is set by the caller to the byte size
    // of buf; on return it holds the number of PCM bytes written.
    // Returns 0 on success, -1 on failure.
    virtual int decode(AVPacket *pkt, char *buf, int &size);
    // Returns the decoder context (NULL until initialize() has allocated it).
    AVCodecContext* codec_ctx();
};

// Wraps a libavcodec encoder: fixed-size chunks of PCM go in, encoded packets come out.
// initialize() opens the "libopus" encoder, so Opus is the only supported output.
class AudioEncoder
{
private:
    int channels_;
    int sampling_rate_;
    // Encoder context; allocated and opened by initialize().
    AVCodecContext* codec_ctx_ = nullptr;
    // Bytes of PCM the encoder expects per encode() call; computed by initialize().
    int want_bytes_ = 0;
    // Reusable input frame. Must start out null: the destructor tests it even
    // when initialize() was never called.
    AVFrame* frame_ = nullptr;
public:
    //Only support "libopus"
    AudioEncoder(int samplerate, int channels);
    virtual ~AudioEncoder();
    // Allocates and opens the encoder. Returns 0 on success, -1 on failure.
    int initialize();
    //The encoder wanted bytes to call encode, if > 0, caller must feed the same bytes
    //Call after initialize succeeded
    int want_bytes();
    // Encodes the PCM bytes carried in frame->data / frame->size into buf and
    // stores the encoded byte count in `size`. Returns 0 on success, -1 on failure.
    virtual int encode(AVPacket *frame, char *buf, int &size);
    // Returns the encoder context (NULL until initialize() has allocated it).
    AVCodecContext* codec_ctx();
};

class AudioResample
{
private:
    int src_rate_;
    int src_ch_layout_;
    int src_nb_channels_;
    enum AVSampleFormat src_sample_fmt_;
    int src_linesize_;
    int src_nb_samples_;
    uint8_t **src_data_;

    int dst_rate_;
    int dst_ch_layout_;
    int dst_nb_channels_;
    enum AVSampleFormat dst_sample_fmt_;
    int dst_linesize_;
    int dst_nb_samples_;
    uint8_t **dst_data_;

    int max_dst_nb_samples_;
    struct SwrContext *swr_ctx_;
public:
    AudioResample(int src_rate, int src_layout, enum AVSampleFormat src_fmt,
                     int src_nb, int dst_rate, int dst_layout, enum AVSampleFormat dst_fmt);
    virtual ~AudioResample();
    int initialize();
    virtual int resample(AVPacket *pcm, char *buf, int &size);
};

// Transcoding pipeline: AudioDecoder -> AudioResample -> AudioEncoder.
// Every member has a default value so that no field is ever read while
// indeterminate, whichever constructor/initialize path ran.
class AacToOpus
{
private:
    AudioDecoder *dec_ = nullptr;
    AudioEncoder *enc_ = nullptr;
    // Created on the first transcode() call.
    AudioResample *resample_ = nullptr;
    int dst_channels_;
    int dst_samplerate_;
    // Number of valid bytes currently held in data_.
    int size_ = 0;
    // Staging buffer that accumulates resampled PCM until the encoder's
    // required chunk size is reached; allocated by initialize().
    char *data_ = nullptr;
    // src_codec_ / dst_codec_ are not referenced by any method in this file.
    int src_codec_ = 0;
    int dst_codec_ = 0;
    // Chunk size in bytes requested by the encoder (AudioEncoder::want_bytes()).
    int enc_want_bytes_ = 0;
    // Only referenced by commented-out debug code in the .cpp.
    FILE *src_fp = nullptr;
public:
    // channels / samplerate describe the desired output stream.
    AacToOpus(int channels, int samplerate);
    virtual ~AacToOpus();
    // Creates the decoder and encoder. Returns 0 on success, non-zero on failure.
    int initialize();
    // Transcodes one input packet. Each produced packet i is copied into the
    // caller-provided buffer buf[i] and its byte size stored in buf_len[i];
    // `n` receives the number of packets produced (possibly 0).
    // Returns 0 on success, -1 on failure.
    virtual int transcode(AVPacket *pkt, char **buf, int *buf_len, int &n);
};



#endif //FFMPEGTEST_AACTOOPUS_H

AacToOpus.cpp

//
// Created by hhy on 2020/11/23.
//

#include "AacToOpus.h"
// Byte size of the scratch buffer that receives resampled PCM in transcode().
static constexpr int kFrameBufMax = 40960;
// Byte size of the scratch buffers that hold decoded PCM and one encoded packet.
static constexpr int kPacketBufMax = 8192;
// Upper bound on the number of OPUS packets per call.
constexpr int kMaxOpusPackets = 8;
// The max size for each OPUS packet.
constexpr int kMaxOpusPacketSize = 4096;

// Creates an idle decoder. No libav resources are allocated until initialize().
// Every member, including the otherwise unused codec_id_, gets a defined value.
AudioDecoder::AudioDecoder()
        : frame_(nullptr),
          packet_(nullptr),
          codec_ctx_(nullptr),
          codec_id_(0)
{
}

// Releases the codec context, frame and packet owned by the decoder.
AudioDecoder::~AudioDecoder()
{
    // Each libav free helper accepts a handle that is still null and resets the
    // pointer it is given to null, so no explicit guards or re-assignments are
    // needed here.
    avcodec_free_context(&codec_ctx_);
    av_frame_free(&frame_);
    av_packet_free(&packet_);
}

// Looks up and opens the "aac" decoder, then allocates the reusable frame and
// packet. Returns 0 on success and -1 on the first failing step. Anything
// allocated before the failure is released later by the destructor.
int AudioDecoder::initialize()
{
    const char* codec_name = "aac";
    const AVCodec *codec = avcodec_find_decoder_by_name(codec_name);
    if (!codec) {
        // This is a decoder lookup; the message used to name the encoder API.
        printf("avcodec_find_decoder_by_name error!\n");
        return -1;
    }

    codec_ctx_ = avcodec_alloc_context3(codec);
    if (!codec_ctx_) {
        printf("avcodec_alloc_context3 error!\n");
        return -1;
    }

    // No stream parameters are supplied: the decoder is opened with its defaults.
    if (avcodec_open2(codec_ctx_, codec, NULL) < 0) {
        printf("avcodec_open2 error!\n");
        return -1;
    }

    frame_ = av_frame_alloc();
    if (!frame_) {
        printf("av_frame_alloc error!\n");
        return -1;
    }

    packet_ = av_packet_alloc();
    if (!packet_) {
        printf("av_packet_alloc error!\n");
        return -1;
    }

    return 0;
}

// Decodes one compressed packet and appends the PCM of the first plane
// (frame_->data[0]) of every produced frame to buf.
//
// pkt  : compressed input; only its data/size fields are used.
// buf  : destination for the PCM bytes.
// size : in  - byte size of buf,
//        out - number of bytes written (0 when the decoder needs more input).
//
// Returns 0 on success and -1 on error. An output that would not fit into buf
// is reported as an error instead of being written past the end of buf.
//
// NOTE(review): only data[0] is copied. For planar sample formats that is
// channel 0; for interleaved formats this would copy only the first
// nb_samples interleaved samples - confirm the decoder output is planar.
int AudioDecoder::decode(AVPacket *pkt, char *buf, int &size)
{
    // Guards against use before, or after a failed, initialize().
    if (!codec_ctx_ || !frame_ || !packet_ || !pkt || !buf) {
        return -1;
    }

    // Borrow the caller's payload; packet_ never owns this memory.
    packet_->data = pkt->data;
    packet_->size = pkt->size;

    int ret = avcodec_send_packet(codec_ctx_, packet_);
    if (ret < 0) {
        return -1;
    }

    const int max = size;
    size = 0;

    for (;;) {
        ret = avcodec_receive_frame(codec_ctx_, frame_);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            // Decoder is drained for this packet.
            return 0;
        } else if (ret < 0) {
            return -1;
        }

        const int pcm_size = av_get_bytes_per_sample(codec_ctx_->sample_fmt);
        if (pcm_size < 0) {
            return -1;
        }

        // The samples of one plane are contiguous, so the whole plane is copied
        // with a single memcpy once the remaining capacity has been verified.
        const int plane_bytes = frame_->nb_samples * pcm_size;
        if (plane_bytes > max - size) {
            printf("decode output buffer too small: need %d, have %d\n",
                   plane_bytes, max - size);
            return -1;
        }
        memcpy(buf + size, frame_->data[0], plane_bytes);
        size += plane_bytes;
    }
}

// Returns the decoder's codec context pointer. It is NULL until initialize()
// has allocated it; the object remains owned by this AudioDecoder.
AVCodecContext* AudioDecoder::codec_ctx()
{
    return codec_ctx_;
}

// Stores the target sample rate and channel count. No libav resources are
// allocated until initialize().
// frame_ must start as null: the destructor inspects it even when
// initialize() was never called.
AudioEncoder::AudioEncoder(int samplerate, int channels)
        : channels_(channels),
          sampling_rate_(samplerate),
          codec_ctx_(nullptr),
          want_bytes_(0),
          frame_(nullptr)
{
}

// Releases the codec context and the reusable input frame.
AudioEncoder::~AudioEncoder()
{
    // Both helpers do nothing when the handle they are given is null, so the
    // explicit null checks are folded into the calls.
    avcodec_free_context(&codec_ctx_);
    av_frame_free(&frame_);
}

int AudioEncoder::initialize()
{
    int err = 0;

    frame_ = av_frame_alloc();
    if (!frame_) {
        return -1;
    }

    const char* codec_name = "libopus";
    const AVCodec *codec = avcodec_find_encoder_by_name(codec_name); //avcodec_find_encoder(AV_CODEC_ID_OPUS);//AV_CODEC_ID_PCM_MULAW
    if (!codec) {
        return -1;
    }

    codec_ctx_ = avcodec_alloc_context3(codec);
    if (!codec_ctx_) {
        return -1;
    }

    codec_ctx_->sample_rate = sampling_rate_;
    codec_ctx_->channels = channels_;
    codec_ctx_->channel_layout = av_get_default_channel_layout(channels_);
    //codec_ctx_->channel_layout = 3;
    codec_ctx_->bit_rate = 48000;

    codec_ctx_->sample_fmt = AV_SAMPLE_FMT_FLT;//AV_SAMPLE_FMT_S16;//
    //TODO: for more level setting
//    codec_ctx_->compression_level = 1;
//    codec_ctx_->sample_fmt = AV_SAMPLE_FMT_FLTP;
//
    //TODO: The encoder 'opus' is experimental but experimental codecs are not enabled, add '-strict -2' if you want to use it.
    codec_ctx_->strict_std_compliance = -2;


    // TODO: FIXME: Show detail error.
    if (avcodec_open2(codec_ctx_, codec, NULL) < 0) {
        printf("Could not open codec\n");
        return -1;
    }

    // Return number of bytes per sample.
    int n_bytes_per_sample = av_get_bytes_per_sample(codec_ctx_->sample_fmt);
    want_bytes_ = codec_ctx_->channels * codec_ctx_->frame_size * n_bytes_per_sample;
    printf("want_bytes_:%d\n", want_bytes_);

    frame_->format = codec_ctx_->sample_fmt;
    frame_->nb_samples = codec_ctx_->frame_size;
    frame_->channel_layout = codec_ctx_->channel_layout;

    if (av_frame_get_buffer(frame_, 0) < 0) {
        printf("Could not get audio frame buffer\n");
        return -1;
    }

    return err;
}

// Returns the number of PCM bytes encode() expects per call, as computed by
// initialize() (channels * frame_size * bytes-per-sample). It is 0 until
// initialize() has run.
int AudioEncoder::want_bytes()
{
    return want_bytes_;
}

// Encodes one chunk of PCM.
//
// frame : carrier for the raw PCM; only frame->data / frame->size are read.
//         When want_bytes() > 0, frame->size must equal it.
// buf   : receives the encoded packet. The capacity of buf is not passed in;
//         the caller must provide room for one encoded packet.
// size  : out - bytes written to buf (0 when the encoder produced nothing yet).
//
// Returns 0 on success, -1 on failure.
int AudioEncoder::encode(AVPacket *frame, char *buf, int &size)
{
    // Guards against use before, or after a failed, initialize().
    if (!codec_ctx_ || !frame_ || !frame || !buf) {
        return -1;
    }

    if (want_bytes_ > 0 && frame->size != want_bytes_) {
        printf("invalid frame size %d, should be %d\n", frame->size, want_bytes_);
        return -1;
    }

    // The encoder may still reference the buffer handed over by the previous
    // avcodec_send_frame(); obtain a private copy before overwriting it.
    if (av_frame_make_writable(frame_) < 0) {
        printf("Could not make the audio frame writable\n");
        return -1;
    }

    // Never copy more than the frame's first plane can hold.
    if (frame->size < 0 || frame->size > frame_->linesize[0]) {
        printf("invalid frame size %d, frame buffer holds %d\n",
               frame->size, frame_->linesize[0]);
        return -1;
    }

    // TODO: Directly use frame?
    memcpy(frame_->data[0], frame->data, frame->size);

    AVPacket *out = av_packet_alloc();
    if (!out) {
        printf("av_packet_alloc error!\n");
        return -1;
    }

    /* send the frame for encoding */
    int r0 = avcodec_send_frame(codec_ctx_, frame_);
    if (r0 < 0) {
        printf("Error sending the frame to the encoder, %d\n", r0);
        av_packet_free(&out);
        return -1;
    }

    /* read all the available output packets (in general there may be any
     * number of them */
    size = 0;
    int result = 0;
    for (;;) {
        r0 = avcodec_receive_packet(codec_ctx_, out);
        if (r0 == AVERROR(EAGAIN) || r0 == AVERROR_EOF) {
            break;
        }
        if (r0 < 0) {
            printf("Failed during encoding %d\n", r0);
            result = -1;
            break;
        }

        // TODO: FIXME: Refine api, got more than one packets.
        // When several packets are produced, only the last one survives in buf.
        memcpy(buf, out->data, out->size);
        size = out->size;
        av_packet_unref(out);
    }

    av_packet_free(&out);
    return result;
}

// Returns the encoder's codec context pointer. It is NULL until initialize()
// has allocated it; the object remains owned by this AudioEncoder.
AVCodecContext* AudioEncoder::codec_ctx()
{
    return codec_ctx_;
}

// Records the source and destination audio parameters. Everything that is
// derived from them (channel counts, line sizes, buffers, the swr context)
// starts out zero/null and is filled in by initialize().
AudioResample::AudioResample(int src_rate, int src_layout, enum AVSampleFormat src_fmt,
                             int src_nb, int dst_rate, int dst_layout, AVSampleFormat dst_fmt)
        : src_rate_(src_rate),
          src_ch_layout_(src_layout),
          src_nb_channels_(0),
          src_sample_fmt_(src_fmt),
          src_linesize_(0),
          src_nb_samples_(src_nb),
          src_data_(NULL),
          dst_rate_(dst_rate),
          dst_ch_layout_(dst_layout),
          dst_nb_channels_(0),
          dst_sample_fmt_(dst_fmt),
          dst_linesize_(0),
          dst_nb_samples_(0),
          dst_data_(NULL),
          max_dst_nb_samples_(0),
          swr_ctx_(NULL)
{
}

AudioResample::~AudioResample()
{
    if (src_data_) {
        av_freep(&src_data_[0]);
        av_freep(&src_data_);
        src_data_ = NULL;
    }
    if (dst_data_) {
        av_freep(&dst_data_[0]);
        av_freep(&dst_data_);
        dst_data_ = NULL;
    }
    if (swr_ctx_) {
        swr_free(&swr_ctx_);
        swr_ctx_ = NULL;
    }
}

// Builds the libswresample context from the stored source/destination
// parameters and allocates one source and one destination sample buffer.
// Returns 0 on success and -1 at the first failing step.
int AudioResample::initialize()
{
    swr_ctx_ = swr_alloc();
    if (swr_ctx_ == NULL) {
        printf("Failed swr_ctx_ is nil\n");
        return -1;
    }

    // Input side of the conversion.
    av_opt_set_int(swr_ctx_, "in_channel_layout", src_ch_layout_, 0);
    av_opt_set_int(swr_ctx_, "in_sample_rate", src_rate_, 0);
    av_opt_set_sample_fmt(swr_ctx_, "in_sample_fmt", src_sample_fmt_, 0);

    // Output side of the conversion.
    av_opt_set_int(swr_ctx_, "out_channel_layout", dst_ch_layout_, 0);
    av_opt_set_int(swr_ctx_, "out_sample_rate", dst_rate_, 0);
    av_opt_set_sample_fmt(swr_ctx_, "out_sample_fmt", dst_sample_fmt_, 0);

    if (swr_init(swr_ctx_) < 0) {
        printf("Failed to initialize the resampling context\n");
        return -1;
    }

    // Source buffer: src_nb_samples_ samples for every source channel.
    src_nb_channels_ = av_get_channel_layout_nb_channels(src_ch_layout_);
    const int src_alloc = av_samples_alloc_array_and_samples(
            &src_data_, &src_linesize_, src_nb_channels_,
            src_nb_samples_, src_sample_fmt_, 0);
    if (src_alloc < 0) {
        printf("Could not allocate source samples\n");
        return -1;
    }

    // Destination buffer: the source sample count rescaled to the destination
    // rate, rounded up. resample() grows it later when more room is needed.
    dst_nb_samples_ = av_rescale_rnd(src_nb_samples_, dst_rate_, src_rate_, AV_ROUND_UP);
    max_dst_nb_samples_ = dst_nb_samples_;

    dst_nb_channels_ = av_get_channel_layout_nb_channels(dst_ch_layout_);
    const int dst_alloc = av_samples_alloc_array_and_samples(
            &dst_data_, &dst_linesize_, dst_nb_channels_,
            dst_nb_samples_, dst_sample_fmt_, 0);
    if (dst_alloc < 0) {
        printf("Could not allocate destination samples\n");
        return -1;
    }

    return 0;
}

// Resamples one chunk of PCM.
//
// pcm  : carrier for the input bytes (pcm->data / pcm->size). The bytes are
//        copied to the start of the source sample buffer; the conversion then
//        always consumes src_nb_samples_ samples per channel.
// buf  : receives the converted samples (first destination plane).
// size : in  - byte size of buf,
//        out - number of bytes written.
//
// Returns 0 on success, -1 on failure. A destination buffer that is too small
// is reported as a failure instead of silently dropping the converted audio.
int AudioResample::resample(AVPacket *pcm, char *buf, int &size)
{
    // Guards against use before, or after a failed, initialize().
    if (!swr_ctx_ || !src_data_ || !dst_data_ || !pcm || !buf) {
        printf("Failed resampler is not initialized\n");
        return -1;
    }

    // Capacity of the source allocation: one line per channel for planar
    // formats, a single line otherwise. A fixed factor of 2 for FLTP would let
    // a mono planar source write past the end of its only plane.
    const int planes = av_sample_fmt_is_planar(src_sample_fmt_) ? src_nb_channels_ : 1;
    if (pcm->size < 0 || src_linesize_ * planes < pcm->size) {
        printf("Failed size not ok\n");
        return -1;
    }
    memcpy(src_data_[0], pcm->data, pcm->size);

    // Output sample count for this call, including samples the resampler
    // still has buffered from earlier calls.
    dst_nb_samples_ = av_rescale_rnd(swr_get_delay(swr_ctx_, src_rate_) +
                                     src_nb_samples_, dst_rate_, src_rate_, AV_ROUND_UP);
    if (dst_nb_samples_ > max_dst_nb_samples_) {
        av_freep(&dst_data_[0]);
        const int alloc_ret = av_samples_alloc(dst_data_, &dst_linesize_, dst_nb_channels_,
                                               dst_nb_samples_, dst_sample_fmt_, 1);
        if (alloc_ret < 0) {
            // The old buffer is gone; force a fresh allocation on the next call
            // instead of converting into a null destination.
            max_dst_nb_samples_ = 0;
            printf("Failed alloc error\n");
            return -1;
        }
        max_dst_nb_samples_ = dst_nb_samples_;
    }

    const int converted = swr_convert(swr_ctx_, dst_data_, dst_nb_samples_,
                                      (const uint8_t **)src_data_, src_nb_samples_);
    if (converted < 0) {
        printf("Failed while converting\"\n");
        return -1;
    }

    const int dst_bufsize = av_samples_get_buffer_size(&dst_linesize_, dst_nb_channels_,
                                                       converted, dst_sample_fmt_, 1);
    if (dst_bufsize < 0) {
        printf("Failed Could not get sample buffer size\"\n");
        return -1;
    }

    const int max = size;
    size = 0;
    if (max < dst_bufsize) {
        printf("Failed output buffer too small: need %d, have %d\n", dst_bufsize, max);
        return -1;
    }
    memcpy(buf, dst_data_[0], dst_bufsize);
    size = dst_bufsize;

    return 0;
}


// Stores the desired output channel count and sample rate. The decoder and
// encoder are created by initialize(), the resampler by the first transcode().
// Every member gets a defined value so that nothing is read while indeterminate.
AacToOpus::AacToOpus(int channels, int samplerate)
        : dec_(nullptr),
          enc_(nullptr),
          resample_(nullptr),
          dst_channels_(channels),
          dst_samplerate_(samplerate),
          size_(0),
          data_(nullptr),
          src_codec_(0),
          dst_codec_(0),
          enc_want_bytes_(0),
          src_fp(nullptr)
{
//    src_fp = fopen("./audio.opus", "w+b");
//    if (!src_fp) {
//        printf("Couldn't open output file.\n");
//    }
}

// Destroys the pipeline stages and the staging buffer.
AacToOpus::~AacToOpus()
{
    // delete / delete[] are no-ops on null pointers.
    delete dec_;
    dec_ = nullptr;

    delete enc_;
    enc_ = nullptr;

    delete resample_;
    resample_ = nullptr;

    // data_ is allocated with new char[] and must be released with delete[].
    delete[] data_;
    data_ = nullptr;

//    if (src_fp) {
//        fclose(src_fp);
//    }
}

int AacToOpus::initialize()
{
    int err = 0;

    dec_ = new AudioDecoder();
    if ((err = dec_->initialize()) != 0) {
        return -1;
    }

    enc_ = new AudioEncoder(dst_samplerate_, dst_channels_);
    if ((err = enc_->initialize()) != 0) {
        return -1;
    }

    enc_want_bytes_ = enc_->want_bytes();
    if (enc_want_bytes_ > 0) {
        data_ = new char[enc_want_bytes_];
    }

    return err;
}

// Transcodes one compressed input packet: decode -> resample -> encode.
//
// pkt     : compressed input packet.
// buf     : array of caller-provided output buffers; produced packet i is
//           copied into buf[i].
// buf_len : receives the byte size of each produced packet.
// n       : out - number of packets produced by this call (may be 0 while the
//           staging buffer is still filling up).
//
// Returns 0 on success, -1 on failure.
//
// The scratch buffers are function-local statics and therefore shared by every
// AacToOpus instance; concurrent calls are not safe.
// NOTE(review): buf is assumed to provide kMaxOpusPackets slots of
// kMaxOpusPacketSize bytes each, as the caller in main.cpp does - confirm for
// other callers.
int AacToOpus::transcode(AVPacket *pkt, char **buf, int *buf_len, int &n)
{
    if (!dec_ || !enc_ || !pkt || !buf || !buf_len) {
        return -1;
    }
    if (!dec_->codec_ctx() || !enc_->codec_ctx()) {
        return -1;
    }

    n = 0;

    int decode_len = kPacketBufMax;
    static char decode_buffer[kPacketBufMax];
    if (dec_->decode(pkt, decode_buffer, decode_len) != 0) {
        return -1;
    }
    printf("decode len:%d\n", decode_len);

    // The decoder needs more input: there is nothing to resample. Continuing
    // would push the stale contents of the resampler's source buffer through
    // the pipeline.
    if (decode_len <= 0) {
        return 0;
    }

    if (!resample_) {
        const int channel_layout = static_cast<int>(av_get_default_channel_layout(dst_channels_));
        AVCodecContext *codec_ctx = dec_->codec_ctx();
        resample_ = new AudioResample(codec_ctx->sample_rate,
                                      static_cast<int>(codec_ctx->channel_layout),
                                      codec_ctx->sample_fmt, codec_ctx->frame_size,
                                      dst_samplerate_, channel_layout,
                                      enc_->codec_ctx()->sample_fmt);
        if (resample_->initialize() != 0) {
            // Do not keep a broken resampler around; retry on the next call.
            delete resample_;
            resample_ = nullptr;
            return -1;
        }
    }

    // `pcm` only carries a data pointer and a size between the stages; it never
    // owns memory, so it needs no unref.
    AVPacket pcm;
    av_init_packet(&pcm);
    pcm.data = reinterpret_cast<uint8_t *>(decode_buffer);
    pcm.size = decode_len;

    int resample_len = kFrameBufMax;
    static char resample_buffer[kFrameBufMax];
    static char encode_buffer[kPacketBufMax];
    if (resample_->resample(&pcm, resample_buffer, resample_len) != 0) {
        return -1;
    }
    if (resample_len <= 0) {
        return 0;
    }

    // Encodes `len` bytes starting at `bytes` and stores a produced packet in
    // the next output slot.
    auto encode_chunk = [&](char *bytes, int len) -> int {
        int encode_len = 0;
        pcm.data = reinterpret_cast<uint8_t *>(bytes);
        pcm.size = len;
        if (enc_->encode(&pcm, encode_buffer, encode_len) != 0) {
            return -1;
        }
        if (encode_len <= 0) {
            return 0; // the encoder buffered the input, nothing to emit yet
        }
        if (n >= kMaxOpusPackets || encode_len > kMaxOpusPacketSize) {
            printf("opus output does not fit: packet %d, size %d\n", n, encode_len);
            return -1;
        }
        memcpy(buf[n], encode_buffer, encode_len);
        buf_len[n] = encode_len;
        n++;
        return 0;
    };

    // We can encode it in one time: the encoder does not request a fixed chunk
    // size, so the resampled data is handed over directly.
    if (enc_want_bytes_ <= 0) {
        return encode_chunk(resample_buffer, resample_len);
    }

    if (!data_) {
        return -1;
    }

    // Need to refill the sample to data, because the frame size is not matched
    // to encoder: top up the staging buffer to exactly enc_want_bytes_, encode
    // it, and repeat while a full chunk is available.
    int offset = 0;
    int remaining = resample_len;
    while (size_ + remaining >= enc_want_bytes_) {
        const int take = enc_want_bytes_ - size_;
        memcpy(data_ + size_, resample_buffer + offset, take);
        offset += take;
        remaining -= take;

        // The staged chunk is consumed by this attempt even if encoding fails,
        // so a bad chunk cannot be retried forever.
        size_ = 0;
        if (encode_chunk(data_, enc_want_bytes_) != 0) {
            return -1;
        }
    }

    // Keep the tail for the next call.
    if (remaining > 0) {
        memcpy(data_ + size_, resample_buffer + offset, remaining);
        size_ += remaining;
    }

    return 0;
}

main.cpp

#include <iostream>
#include <memory>
#include <list>
#include "AacToOpus.h"

// Buffer sizes in bytes; neither of these two is referenced by main() below.
static constexpr int kFrameBufMax = 40960;
static constexpr int kPacketBufMax = 8192;
// Number of output slots handed to AacToOpus::transcode().
constexpr int kMaxOpusPackets = 8;
// The max size for each OPUS packet.
constexpr int kMaxOpusPacketSize = 4096;

int main(int argc, char* argv[]) {
    std::cout << "Hello, World!" << std::endl;

    const std::string input_ = "输入rtsp url";
    AVFormatContext *pFormatCtx = avformat_alloc_context();
    int audioIndex = -1;
    AVPacket *packet;

    static char* opus_payloads[kMaxOpusPackets];

    AVDictionary *options = nullptr;
    av_dict_set(&options, "rtsp_transport", "tcp", 0);
    std::cout << "url:" << input_.c_str() << std::endl;
    if (avformat_open_input(&pFormatCtx, input_.c_str(), NULL, &options) != 0) {
        printf("Couldn't open input stream.\n");
        if (options) av_dict_free(&options);
        avformat_close_input(&pFormatCtx);
        return -1;
    }

    std::cout << "avformat_find_stream_info start" << std::endl;

    if (avformat_find_stream_info(pFormatCtx, NULL) < 0) {
        printf("Couldn't find stream information.\n");
        if (options) av_dict_free(&options);
        avformat_close_input(&pFormatCtx);
        return -1;
    }

    av_dump_format(pFormatCtx, NULL, input_.c_str(), 0);

    for (int i = 0; i < pFormatCtx->nb_streams; i++) {
        if (pFormatCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            if (audioIndex != -1) {
                continue;
            }
            audioIndex = i;

            uint64_t channel = pFormatCtx->streams[audioIndex]->codec->channels;
            uint64_t sample_rate = pFormatCtx->streams[audioIndex]->codec->sample_rate;
            enum AVMediaType codec_type = pFormatCtx->streams[audioIndex]->codec->codec_type;
            int64_t bit_rate = pFormatCtx->streams[audioIndex]->codec->bit_rate;
            int64_t channel_layout = pFormatCtx->streams[audioIndex]->codec->channel_layout;
            enum AVSampleFormat sample_fmt = pFormatCtx->streams[audioIndex]->codec->sample_fmt;
            //channel:2,sample_rate:48000,codec_type:1,bit_rate:0,channel_layout:3
            std::cout <<"channel:" << channel<<",sample_rate:" << sample_rate<<",codec_type:" << codec_type<<",bit_rate:" << bit_rate<<",channel_layout:" << channel_layout <<std::endl;

        }
    }

    packet = (AVPacket *) av_malloc(sizeof(AVPacket));
    AVFrame *decoded_frame = NULL;

    std::shared_ptr<AacToOpus> opus_ptr;
    opus_ptr.reset(new AacToOpus(2, 48000));

    if (opus_ptr->initialize() != 0) {
        std::cout << "opus init error" << std::endl;
    }


    for (;;) {
        auto time1 = std::chrono::steady_clock::now();
        if (av_read_frame(pFormatCtx, packet) >= 0) {
            if (packet->stream_index == audioIndex) {

                if (packet->size) {
                    if (opus_ptr) {
                        static char* opus_payloads[kMaxOpusPackets];
                        static char opus_packets_cache[kMaxOpusPackets][kMaxOpusPacketSize];
                        opus_payloads[0] = &opus_packets_cache[0][0];
                        for (int i = 1; i < kMaxOpusPackets; i++) {
                            opus_payloads[i] = opus_packets_cache[i];
                        }

                        int nn_opus_packets = 0;
                        int opus_sizes[kMaxOpusPackets];
                        if (opus_ptr->transcode(packet, opus_payloads, opus_sizes, nn_opus_packets) != 0) {
                            std::cout << "opus transcode error" << std::endl;
                        }

                        for (int i = 0; i < nn_opus_packets; i++) {
                            timestamp += 960;
                            std::cout << "opus size:"<< opus_sizes[i] << std::endl;
                            //send data 
                        }
                    }
                    //std::cout << "recv audio dts:" << packet->dts << std::endl;
                }
            }

            av_free_packet(packet);

        } else {
            break;
        }
    }


    if (packet) av_free_packet(packet);
    if (options) av_dict_free(&options);
    avformat_close_input(&pFormatCtx);

    av_frame_free(&decoded_frame);


    return 0;
}

注意：

1.代码主要参考srs的aac转opus

2.输入的aac是双声道时，解码后只取其中一个声道的数据即可，否则编码后的声音会出现问题

3.main.cpp仅为示例代码，实际使用前需要修改（例如把输入地址替换为真实的rtsp url，并补充发送opus数据的逻辑）

4.可能需要先重新编译ffmpeg以支持aac和opus：执行./configure时添加 --enable-encoder=opus --enable-encoder=libopus --enable-libopus 等选项

ffmpeg 实现aac转opus

注意：

猜你喜欢