Skip to content

Unify use of 'output' and 'desired' in audio code-base #697

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,11 @@ void AudioEncoder::initializeEncoder(
// well when "-b:a" isn't specified.
avCodecContext_->bit_rate = bitRate.value_or(0);

desiredNumChannels_ = static_cast<int>(numChannels.value_or(wf_.sizes()[0]));
validateNumChannels(*avCodec, desiredNumChannels_);
outNumChannels_ = static_cast<int>(numChannels.value_or(wf_.sizes()[0]));
validateNumChannels(*avCodec, outNumChannels_);
// The avCodecContext layout defines the layout of the encoded output, it's
// not related to the input sampes.
setDefaultChannelLayout(avCodecContext_, desiredNumChannels_);
setDefaultChannelLayout(avCodecContext_, outNumChannels_);

validateSampleRate(*avCodec, sampleRate);
avCodecContext_->sample_rate = sampleRate;
Expand Down Expand Up @@ -304,7 +304,7 @@ void AudioEncoder::encodeInnerLoop(
bool mustConvert =
(srcAVFrame != nullptr &&
(avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP ||
getNumChannels(srcAVFrame) != desiredNumChannels_));
getNumChannels(srcAVFrame) != outNumChannels_));

UniqueAVFrame convertedAVFrame;
if (mustConvert) {
Expand All @@ -315,14 +315,14 @@ void AudioEncoder::encodeInnerLoop(
srcAVFrame->sample_rate, // No sample rate conversion
srcAVFrame->sample_rate,
srcAVFrame,
desiredNumChannels_));
outNumChannels_));
}
convertedAVFrame = convertAudioAVFrameSamples(
swrContext_,
srcAVFrame,
avCodecContext_->sample_fmt,
srcAVFrame->sample_rate, // No sample rate conversion
desiredNumChannels_);
outNumChannels_);
TORCH_CHECK(
convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
"convertedAVFrame->nb_samples=",
Expand Down
4 changes: 2 additions & 2 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ class AudioEncoder {
UniqueAVCodecContext avCodecContext_;
int streamIndex_;
UniqueSwrContext swrContext_;
// TODO-ENCODING: desiredNumChannels should just be part of an options struct,
// TODO-ENCODING: outNumChannels should just be part of an options struct,
// see other TODO above.
int desiredNumChannels_ = -1;
int outNumChannels_ = -1;

const torch::Tensor wf_;

Expand Down
87 changes: 43 additions & 44 deletions src/torchcodec/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,74 +159,74 @@ namespace {
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4

// Returns:
// - the srcAVFrame's channel layout if srcAVFrame has desiredNumChannels
// - the default channel layout with desiredNumChannels otherwise.
AVChannelLayout getDesiredChannelLayout(
int desiredNumChannels,
// - the srcAVFrame's channel layout if srcAVFrame has outNumChannels
// - the default channel layout with outNumChannels otherwise.
AVChannelLayout getOutputChannelLayout(
int outNumChannels,
const UniqueAVFrame& srcAVFrame) {
AVChannelLayout desiredLayout;
if (desiredNumChannels == getNumChannels(srcAVFrame)) {
desiredLayout = srcAVFrame->ch_layout;
AVChannelLayout outLayout;
if (outNumChannels == getNumChannels(srcAVFrame)) {
outLayout = srcAVFrame->ch_layout;
} else {
av_channel_layout_default(&desiredLayout, desiredNumChannels);
av_channel_layout_default(&outLayout, outNumChannels);
}
return desiredLayout;
return outLayout;
}

#else

// Same as above
int64_t getDesiredChannelLayout(
int desiredNumChannels,
int64_t getOutputChannelLayout(
int outNumChannels,
const UniqueAVFrame& srcAVFrame) {
int64_t desiredLayout;
if (desiredNumChannels == getNumChannels(srcAVFrame)) {
desiredLayout = srcAVFrame->channel_layout;
int64_t outLayout;
if (outNumChannels == getNumChannels(srcAVFrame)) {
outLayout = srcAVFrame->channel_layout;
} else {
desiredLayout = av_get_default_channel_layout(desiredNumChannels);
outLayout = av_get_default_channel_layout(outNumChannels);
}
return desiredLayout;
return outLayout;
}
#endif
} // namespace

// Sets dstAVFrame' channel layout to getDesiredChannelLayout(): see doc above
// Sets dstAVFrame' channel layout to getOutputChannelLayout(): see doc above
void setChannelLayout(
UniqueAVFrame& dstAVFrame,
const UniqueAVFrame& srcAVFrame,
int desiredNumChannels) {
int outNumChannels) {
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
AVChannelLayout desiredLayout =
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
auto status = av_channel_layout_copy(&dstAVFrame->ch_layout, &desiredLayout);
AVChannelLayout outLayout =
getOutputChannelLayout(outNumChannels, srcAVFrame);
auto status = av_channel_layout_copy(&dstAVFrame->ch_layout, &outLayout);
TORCH_CHECK(
status == AVSUCCESS,
"Couldn't copy channel layout to avFrame: ",
getFFMPEGErrorStringFromErrorCode(status));
#else
dstAVFrame->channel_layout =
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
dstAVFrame->channels = desiredNumChannels;
getOutputChannelLayout(outNumChannels, srcAVFrame);
dstAVFrame->channels = outNumChannels;
#endif
}

SwrContext* createSwrContext(
AVSampleFormat srcSampleFormat,
AVSampleFormat desiredSampleFormat,
AVSampleFormat outSampleFormat,
int srcSampleRate,
int desiredSampleRate,
int outSampleRate,
const UniqueAVFrame& srcAVFrame,
int desiredNumChannels) {
int outNumChannels) {
SwrContext* swrContext = nullptr;
int status = AVSUCCESS;
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
AVChannelLayout desiredLayout =
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
AVChannelLayout outLayout =
getOutputChannelLayout(outNumChannels, srcAVFrame);
status = swr_alloc_set_opts2(
&swrContext,
&desiredLayout,
desiredSampleFormat,
desiredSampleRate,
&outLayout,
outSampleFormat,
outSampleRate,
&srcAVFrame->ch_layout,
srcSampleFormat,
srcSampleRate,
Expand All @@ -238,13 +238,12 @@ SwrContext* createSwrContext(
"Couldn't create SwrContext: ",
getFFMPEGErrorStringFromErrorCode(status));
#else
int64_t desiredLayout =
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
int64_t outLayout = getOutputChannelLayout(outNumChannels, srcAVFrame);
swrContext = swr_alloc_set_opts(
nullptr,
desiredLayout,
desiredSampleFormat,
desiredSampleRate,
outLayout,
outSampleFormat,
outSampleRate,
srcAVFrame->channel_layout,
srcSampleFormat,
srcSampleRate,
Expand All @@ -267,19 +266,19 @@ SwrContext* createSwrContext(
UniqueAVFrame convertAudioAVFrameSamples(
const UniqueSwrContext& swrContext,
const UniqueAVFrame& srcAVFrame,
AVSampleFormat desiredSampleFormat,
int desiredSampleRate,
int desiredNumChannels) {
AVSampleFormat outSampleFormat,
int outSampleRate,
int outNumChannels) {
UniqueAVFrame convertedAVFrame(av_frame_alloc());
TORCH_CHECK(
convertedAVFrame,
"Could not allocate frame for sample format conversion.");

convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
convertedAVFrame->format = static_cast<int>(outSampleFormat);

convertedAVFrame->sample_rate = desiredSampleRate;
convertedAVFrame->sample_rate = outSampleRate;
int srcSampleRate = srcAVFrame->sample_rate;
if (srcSampleRate != desiredSampleRate) {
if (srcSampleRate != outSampleRate) {
// Note that this is an upper bound on the number of output samples.
// `swr_convert()` will likely not fill convertedAVFrame with that many
// samples if sample rate conversion is needed. It will buffer the last few
Expand All @@ -290,14 +289,14 @@ UniqueAVFrame convertAudioAVFrameSamples(
// tighter bound.
convertedAVFrame->nb_samples = av_rescale_rnd(
swr_get_delay(swrContext.get(), srcSampleRate) + srcAVFrame->nb_samples,
desiredSampleRate,
outSampleRate,
srcSampleRate,
AV_ROUND_UP);
} else {
convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
}

setChannelLayout(convertedAVFrame, srcAVFrame, desiredNumChannels);
setChannelLayout(convertedAVFrame, srcAVFrame, outNumChannels);

auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
TORCH_CHECK(
Expand Down
31 changes: 15 additions & 16 deletions src/torchcodec/_core/SingleStreamDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1186,11 +1186,11 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
FrameOutput& frameOutput) {
AVSampleFormat srcSampleFormat =
static_cast<AVSampleFormat>(srcAVFrame->format);
AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
AVSampleFormat outSampleFormat = AV_SAMPLE_FMT_FLTP;

StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
int srcSampleRate = srcAVFrame->sample_rate;
int desiredSampleRate =
int outSampleRate =
streamInfo.audioStreamOptions.sampleRate.value_or(srcSampleRate);

int srcNumChannels = getNumChannels(streamInfo.codecContext);
Expand All @@ -1203,50 +1203,49 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
". If you are hitting this, it may be because you are using "
"a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
"valid scenarios. Try to upgrade FFmpeg?");
int desiredNumChannels =
int outNumChannels =
streamInfo.audioStreamOptions.numChannels.value_or(srcNumChannels);

bool mustConvert =
(srcSampleFormat != desiredSampleFormat ||
srcSampleRate != desiredSampleRate ||
srcNumChannels != desiredNumChannels);
(srcSampleFormat != outSampleFormat || srcSampleRate != outSampleRate ||
srcNumChannels != outNumChannels);

UniqueAVFrame convertedAVFrame;
if (mustConvert) {
if (!streamInfo.swrContext) {
streamInfo.swrContext.reset(createSwrContext(
srcSampleFormat,
desiredSampleFormat,
outSampleFormat,
srcSampleRate,
desiredSampleRate,
outSampleRate,
srcAVFrame,
desiredNumChannels));
outNumChannels));
}

convertedAVFrame = convertAudioAVFrameSamples(
streamInfo.swrContext,
srcAVFrame,
desiredSampleFormat,
desiredSampleRate,
desiredNumChannels);
outSampleFormat,
outSampleRate,
outNumChannels);
}
const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;

AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
TORCH_CHECK(
format == desiredSampleFormat,
format == outSampleFormat,
"Something went wrong, the frame didn't get converted to the desired format. ",
"Desired format = ",
av_get_sample_fmt_name(desiredSampleFormat),
av_get_sample_fmt_name(outSampleFormat),
"source format = ",
av_get_sample_fmt_name(format));

int numChannels = getNumChannels(avFrame);
TORCH_CHECK(
numChannels == desiredNumChannels,
numChannels == outNumChannels,
"Something went wrong, the frame didn't get converted to the desired ",
"number of channels = ",
desiredNumChannels,
outNumChannels,
". Got ",
numChannels,
" instead.");
Expand Down
Loading