FFmpeg provides its audio conversion functions in the libswresample module; the previously used libavresample module is deprecated. According to the official documentation, libswresample provides highly optimized audio resampling, channel layout conversion (rematrixing) and sample format conversion. Conceptually, audio resampling first reconstructs the original continuous audio signal and then samples it again at the new rate. Resampling is divided into upsampling and downsampling: upsampling requires interpolation, and downsampling requires decimation. Converting from a high to a low sample rate is a lossy process, and FFmpeg provides several options and algorithms for resampling.
1. Introduction to libswresample module
FFmpeg's libswresample module covers resampling, sample format conversion and channel layout conversion. For the full documentation, see https://ffmpeg.org/libswresample.html. The official description is as follows:
The libswresample library performs highly optimized audio resampling, rematrixing and sample format conversion operations.
Specifically, this library performs the following conversions:
Resampling: is the process of changing the audio rate, for example from a high sample rate of 44100Hz to 8000Hz.
Audio conversion from high to low sample rate is a lossy process. Several resampling options and algorithms are available.
Format conversion: is the process of converting the type of samples, for example from 16-bit signed samples to unsigned 8-bit or float samples.
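It also handles packing conversion, when passing from packed layout (all samples belonging to distinct channels interleaved in the same buffer) to planar layout (all samples belonging to the same channel stored in a dedicated buffer or "plane").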
Rematrixing: is the process of changing the channel layout, for example from stereo to mono.
When the input channels cannot be mapped to the output streams, the process is lossy, since it involves different gain factors and mixing.
2. SwrContext structure
SwrContext is the audio conversion context structure. It is defined in the internal header swresample_internal.h (the listing below is abridged):
/*
 * Audio conversion context used by the swr_* functions shown in this file.
 *
 * NOTE(review): this listing is abridged. The swr_convert() excerpt in this
 * file also accesses in_buffer_count, in_buffer_index, drop_output and outpts,
 * none of which are listed here.
 */
struct SwrContext {
enum AVSampleFormat in_sample_fmt; // input sample format
enum AVSampleFormat int_sample_fmt; // internal sample format
enum AVSampleFormat out_sample_fmt; // output sample format
int64_t in_ch_layout; // input channel layout
int64_t out_ch_layout; // output channel layout
int in_sample_rate; // input sample rate
int out_sample_rate; // output sample rate
int flags; // miscellaneous flags such as SWR_FLAG_RESAMPLE
float slev; // surround mixing level
float clev; // center mixing level
float lfe_mix_level; // LFE mixing level
float rematrix_volume; // rematrixing volume coefficient
float rematrix_maxval; // maximum value for rematrixing output
int matrix_encoding; // matrixed stereo encoding
const int *channel_map; // channel index (or -1 if muted channel) map
int used_ch_count; // number of used input channels
int engine; // resampling engine selector; swr_init() switches on it (SWR_ENGINE_SWR / SWR_ENGINE_SOXR)
int user_in_ch_count; // User set input channel count
int user_out_ch_count; // User set output channel count
int user_used_ch_count; // User set used channel count
int64_t user_in_ch_layout; // User set input channel layout
int64_t user_out_ch_layout; // User set output channel layout
enum AVSampleFormat user_int_sample_fmt; // User set internal sample format
int user_dither_method; // User set dither method
struct DitherContext dither; // dither state; swr_init() copies user_dither_method into dither.method
int filter_size; // length of each FIR filter relative to the cutoff frequency
int phase_shift; // log2 of the number of entries in the resampling polyphase filterbank
int linear_interp; // if 1 then the resampling FIR filter will be linearly interpolated
int exact_rational; // if 1 then enable non power of 2 phase_count
double cutoff; // resampling cutoff frequency
int filter_type; // swr resampling filter type
double kaiser_beta; // swr beta value for Kaiser window
float min_compensation; // swr minimum below which no compensation will happen
float min_hard_compensation; // swr minimum below which no silence inject / sample drop will happen
float soft_compensation_duration; // swr duration over which soft compensation is applied
float max_soft_compensation; // swr maximum soft compensation in seconds
float async; // swr simple 1 parameter async, similar to ffmpegs -async
int64_t firstpts_in_samples; // swr first pts in samples
int resample_first; // 1 if resampling must come first, 0 if rematrixing
int rematrix; // flag to indicate if rematrixing is needed
int rematrix_custom; // flag to indicate that a custom matrix has been defined
AudioData in; // input audio data
AudioData postin; // post-input audio data: used for rematrix/resample
AudioData midbuf; // intermediate audio data
AudioData preout; // pre-output audio data: used for rematrix/resample
AudioData out; // converted output audio data
AudioData in_buffer; // buffered input samples; swr_convert() drains it first when no resampler is active
AudioData silence; // NOTE(review): presumably a buffer of silent samples - confirm against swresample_internal.h
struct AudioConvert *in_convert; // input conversion context
struct AudioConvert *out_convert; // output conversion context
struct AudioConvert *full_convert; // full conversion context
struct ResampleContext *resample; // resampling context
struct Resampler const *resampler; // resampler virtual function table
double matrix[SWR_CH_MAX][SWR_CH_MAX]; // floating point rematrixing coefficients
float matrix_flt[SWR_CH_MAX][SWR_CH_MAX]; // rematrixing coefficients
int32_t matrix32[SWR_CH_MAX][SWR_CH_MAX]; // 17.15 fixed point rematrixing coefficients
uint8_t matrix_ch[SWR_CH_MAX][SWR_CH_MAX+1]; // Lists of input channels per output channel
};
3. swr_alloc and swr_alloc_set_opts
The swr_alloc() function allocates the SwrContext structure; it must be called, and the options set, before swr_init(). swr_alloc_set_opts() builds on swr_alloc(): it additionally sets the common option parameters in a single call, including the input and output sample formats, sample rates and channel layouts. The official usage example is as follows:
// Variant 1: allocate an empty context, then set each option by name.
// This configures a 5.1 / 48000 Hz / planar-float input and a
// stereo / 44100 Hz / signed 16-bit output.
SwrContext *swr = swr_alloc();
av_opt_set_channel_layout(swr, "in_channel_layout", AV_CH_LAYOUT_5POINT1, 0);
av_opt_set_channel_layout(swr, "out_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_int(swr, "in_sample_rate", 48000, 0);
av_opt_set_int(swr, "out_sample_rate", 44100, 0);
av_opt_set_sample_fmt(swr, "in_sample_fmt", AV_SAMPLE_FMT_FLTP, 0);
av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
// Variant 2: the same job can be done using swr_alloc_set_opts() as well.
// This is an alternative to variant 1, not a continuation: both snippets
// declare `swr`, so only one of them can appear in a given scope.
// Note the argument order: output parameters come before input parameters.
// NOTE(review): newer FFmpeg releases appear to deprecate this call in favour
// of swr_alloc_set_opts2() - confirm against the targeted FFmpeg version.
SwrContext *swr = swr_alloc_set_opts(NULL, // we're allocating a new context
AV_CH_LAYOUT_STEREO, // out_ch_layout
AV_SAMPLE_FMT_S16, // out_sample_fmt
44100, // out_sample_rate
AV_CH_LAYOUT_5POINT1, // in_ch_layout
AV_SAMPLE_FMT_FLTP, // in_sample_fmt
48000, // in_sample_rate
0, // log_offset
NULL); // log_ctx
4. swr_init
swr_init() initializes the SwrContext: it validates the user-supplied parameters, copies them into the working fields, selects the resampler, and allocates the input and output converters:
/*
 * Initialize a SwrContext (abridged excerpt; each "......" line marks code
 * omitted from this listing).
 *
 * Visible steps: reset the context with clear_context(), validate the sample
 * formats and sample rates, copy the user_* settings into the working fields,
 * select the resampler implementation from s->engine, allocate the input and
 * output sample-format converters, and initialize rematrixing when required.
 *
 * Returns 0 on success, AVERROR(EINVAL) for an invalid sample format or
 * sample rate or an unavailable engine, or the negative value returned by
 * swri_rematrix_init() (after calling swr_close(s)).
 */
int swr_init(struct SwrContext *s){
int ret;
char l1[1024], l2[1024];
clear_context(s);
// Validate the parameters
if(s-> in_sample_fmt >= AV_SAMPLE_FMT_NB){
return AVERROR(EINVAL);
}
if(s->out_sample_fmt >= AV_SAMPLE_FMT_NB){
return AVERROR(EINVAL);
}
if(s-> in_sample_rate <= 0){
return AVERROR(EINVAL);
}
if(s->out_sample_rate <= 0){
return AVERROR(EINVAL);
}
// Copy the user-set values into the working fields
s->out.ch_count = s-> user_out_ch_count;
s-> in.ch_count = s-> user_in_ch_count;
s->used_ch_count = s->user_used_ch_count;
s-> in_ch_layout = s-> user_in_ch_layout;
s->out_ch_layout = s->user_out_ch_layout;
s->int_sample_fmt= s->user_int_sample_fmt;
s->dither.method = s->user_dither_method;
......
// Select the resampler implementation
switch(s->engine){
#if CONFIG_LIBSOXR
case SWR_ENGINE_SOXR: s->resampler = &swri_soxr_resampler; break;
#endif
case SWR_ENGINE_SWR : s->resampler = &swri_resampler; break;
default:
av_log(s, AV_LOG_ERROR, "resampling engine is unavailable\n");
return AVERROR(EINVAL);
}
......
// Allocate the input converter (input format -> internal format) and the
// output converter (internal format -> output format)
s->in_convert = swri_audio_convert_alloc(s->int_sample_fmt,
s-> in_sample_fmt,
s->used_ch_count,
s->channel_map, 0);
s->out_convert= swri_audio_convert_alloc(s->out_sample_fmt,
s->int_sample_fmt,
s->out.ch_count, NULL, 0);
// If channel conversion (rematrixing) is needed, initialize the rematrixing
// functions; this is also done whenever a dither method is set
if(s->rematrix || s->dither.method) {
ret = swri_rematrix_init(s);
if (ret < 0)
goto fail;
}
......
return 0;
fail:
// Error path: close the context before reporting the failure
swr_close(s);
return ret;
}
5. swr_convert
swr_convert() mainly calls the internal function swr_convert_internal() to perform the audio conversion. At the end of the audio stream, in_arg can be set to NULL and in_count to 0 to flush the remaining audio data from the internal buffer. If more input is provided than there is output space, the excess input is buffered internally:
/*
 * Convert audio (abridged excerpt; each "......" line marks code omitted
 * from this listing).
 *
 * Returns AVERROR(EINVAL) if the context has not been initialized. With a
 * resampler present the work is delegated to a single
 * swr_convert_internal() call; otherwise samples already held in
 * s->in_buffer are converted first. The value returned is the result of
 * swr_convert_internal() (or a negative error code from it).
 */
int swr_convert(struct SwrContext *s, uint8_t *out_arg[SWR_CH_MAX],
int out_count, const uint8_t *in_arg [SWR_CH_MAX],
int in_count){
AudioData * in= &s->in;
AudioData *out= &s->out;
int av_unused max_output;
// Check whether the context has been initialized
if (!swr_is_initialized(s)) {
return AVERROR(EINVAL);
}
......
if(s->resample){
// Call the internal function to perform the audio conversion
int ret = swr_convert_internal(s, out, out_count, in, in_count);
// Advance the output timestamp only when samples were produced and the
// output is not being dropped
if(ret>0 && !s->drop_output)
s->outpts += ret * (int64_t)s->in_sample_rate;
av_assert2(max_output < 0 || ret < 0 || ret <= max_output);
return ret;
}else{
AudioData tmp= *in;
int ret2=0;
int ret, size;
// Drain previously buffered input first, limited by the output space
size = FFMIN(out_count, s->in_buffer_count);
if(size){
// NOTE(review): presumably points tmp at in_buffer offset by in_buffer_index - confirm buf_set() semantics
buf_set(&tmp, &s->in_buffer, s->in_buffer_index);
ret= swr_convert_internal(s, out, size, &tmp, size);
if(ret<0)
return ret;
ret2= ret;
s->in_buffer_count -= ret;
s->in_buffer_index += ret;
// NOTE(review): presumably advances the output position past the ret samples just written - confirm
buf_set(out, out, ret);
out_count -= ret;
// Buffer fully consumed: reset the read index
if(!s->in_buffer_count)
s->in_buffer_index = 0;
}
......
return ret2;
}
}
The code of swr_convert_internal() is as follows:
/*
 * Internal conversion routine (abridged excerpt; the "......" line marks
 * code omitted from this listing).
 *
 * With a full converter available, the input is converted to the output in
 * one step. Otherwise the intermediate buffers are (re)allocated and the
 * stages run in the order selected by s->resample_first: input conversion,
 * then resample followed by rematrix, or rematrix followed by resample.
 * Returns out_count (updated by resample() when it runs) or a negative
 * value from swri_realloc_audio().
 *
 * NOTE(review): ret, postin, midbuf and preout are used below but are not
 * declared or assigned in this excerpt; those lines appear to have been
 * omitted from the listing.
 */
static int swr_convert_internal(struct SwrContext *s,
AudioData *out, int out_count,
AudioData *in , int in_count){
// Full conversion available: convert directly and return the result
if(s->full_convert){
swri_audio_convert(s->full_convert, out, in, in_count);
return out_count;
}
// Reallocate the intermediate buffers
if((ret=swri_realloc_audio(&s->postin, in_count))<0)
return ret;
if(s->resample_first){
// midbuf holds resampled data, still with the used input channel count
av_assert0(s->midbuf.ch_count == s->used_ch_count);
if((ret=swri_realloc_audio(&s->midbuf, out_count))<0)
return ret;
}else{
// midbuf holds rematrixed data, already with the output channel count
av_assert0(s->midbuf.ch_count == s->out.ch_count);
if((ret=swri_realloc_audio(&s->midbuf, in_count))<0)
return ret;
}
if((ret=swri_realloc_audio(&s->preout, out_count))<0)
return ret;
// If the input is not already the post-input buffer, run the input
// sample-format conversion
if(in != postin){
swri_audio_convert(s->in_convert, postin, in, in_count);
}
if(s->resample_first){
// Resample first, then rematrix; a stage is skipped when its source and
// destination buffers are the same
if(postin != midbuf)
out_count= resample(s, midbuf, out_count, postin, in_count);
if(midbuf != preout)
swri_rematrix(s, preout, midbuf, out_count, preout==out);
}else{
// Rematrix first, then resample
if(postin != midbuf)
swri_rematrix(s, midbuf, postin, in_count, midbuf==out);
if(midbuf != preout)
out_count= resample(s, preout, out_count, midbuf, in_count);
}
......
return out_count;
}
The official audio conversion processing demo is as follows:
/*
 * Conversion loop based on the official libswresample example: 48000 Hz
 * input is converted to 44100 Hz stereo signed 16-bit output using an
 * already initialized `swr` context. get_input() and handle_output() are
 * application-provided.
 *
 * Unlike the plain example, the results of av_samples_alloc() and
 * swr_convert() are checked, so a failed allocation never leaves `output`
 * uninitialized and a negative sample count is never passed to
 * handle_output().
 */
uint8_t **input;
int in_samples;
while (get_input(&input, &in_samples)) {
    uint8_t *output = NULL;
    /* Upper bound on the output samples: samples still delayed inside the
     * context plus the new input, rescaled from 48000 Hz to 44100 Hz and
     * rounded up. */
    int out_samples = av_rescale_rnd(swr_get_delay(swr, 48000)
                                     + in_samples, 44100, 48000,
                                     AV_ROUND_UP);
    if (av_samples_alloc(&output, NULL, 2, out_samples,
                         AV_SAMPLE_FMT_S16, 0) < 0) {
        break; /* allocation failed: stop converting */
    }
    out_samples = swr_convert(swr, &output, out_samples,
                              input, in_samples);
    if (out_samples < 0) {
        /* conversion error: release the buffer and stop */
        av_freep(&output);
        break;
    }
    handle_output(output, out_samples);
    av_freep(&output);
}
6. swr_close and swr_free
swr_close() closes the SwrContext, while swr_free() additionally releases the resampler state and frees the context itself through the caller's pointer. The code is shown below:
/**
 * Free a SwrContext through the caller's pointer.
 *
 * When *ss is non-NULL the context is cleared with clear_context() and, if a
 * resampler implementation is set, its free() hook is called on the
 * resampling state. ss is then handed to av_freep() in every case.
 *
 * @param ss address of the context pointer; it is dereferenced
 *           unconditionally
 */
void swr_free(SwrContext **ss)
{
    SwrContext *ctx = *ss;

    if (ctx != NULL) {
        clear_context(ctx);
        /* Let the selected resampler implementation release its state. */
        if (ctx->resampler != NULL) {
            ctx->resampler->free(&ctx->resample);
        }
    }

    av_freep(ss);
}
/**
 * Close a SwrContext by clearing it with clear_context(). The context
 * structure itself is not freed here.
 *
 * @param s context to close
 */
void swr_close(SwrContext *s)
{
    clear_context(s);
}