Skip to content

Support all audio formats by converting to FLTP #556

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Mar 17, 2025
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
823e9bb
WIP
NicolasHug Mar 13, 2025
846b7b8
Merge branch 'main' of github.com:pytorch/torchcodec into fltp
NicolasHug Mar 13, 2025
979e72a
WIP
NicolasHug Mar 13, 2025
f6ecd32
WIP
NicolasHug Mar 13, 2025
58e5277
Add sample_format to audio metadata
NicolasHug Mar 13, 2025
73af3e7
Merge branch 'sample_format_metadata' into fltp
NicolasHug Mar 13, 2025
94c11fc
Add non-fltp file
NicolasHug Mar 13, 2025
ee30cc3
Merge branch 'sample_format_metadata' into fltp
NicolasHug Mar 13, 2025
9ee63e6
more stuff
NicolasHug Mar 13, 2025
ccd0cb5
Debug, ffmpeg7 only
NicolasHug Mar 14, 2025
6b8e8be
Use new FFmpeg builds
NicolasHug Mar 14, 2025
999757a
Fix
NicolasHug Mar 14, 2025
bb92a9f
Revert "Debug, ffmpeg7 only"
NicolasHug Mar 14, 2025
ba0ca4a
Reapply "Debug, ffmpeg7 only"
NicolasHug Mar 14, 2025
96592a7
Use ffmpeg7 for macos too
NicolasHug Mar 14, 2025
640a82d
Put back everything except ffmpeg4
NicolasHug Mar 14, 2025
8378487
Don't use raw pointer
NicolasHug Mar 14, 2025
f6c70b6
WIP
NicolasHug Mar 14, 2025
de59431
Merge branch 'main' of github.com:pytorch/torchcodec into fltp
NicolasHug Mar 14, 2025
da1796a
Put back FFmpeg4
NicolasHug Mar 14, 2025
2ab9208
lint
NicolasHug Mar 14, 2025
1c4b1f9
AAAAH
NicolasHug Mar 14, 2025
f235cc2
Revert some stuff
NicolasHug Mar 14, 2025
44500ad
Put back flags
NicolasHug Mar 14, 2025
db9ff94
Add one more check
NicolasHug Mar 14, 2025
3da223c
Merge branch 'main' of github.com:pytorch/torchcodec into fltp
NicolasHug Mar 17, 2025
d5b4ec8
Move stuff into ffmpeg.cpp
NicolasHug Mar 17, 2025
07287d5
const cast
NicolasHug Mar 17, 2025
a2b1988
Fix
NicolasHug Mar 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/torchcodec/decoders/_core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,10 @@ if(DEFINED ENV{BUILD_AGAINST_ALL_FFMPEG_FROM_S3})
)


make_torchcodec_library(libtorchcodec4 ffmpeg4)
make_torchcodec_library(libtorchcodec7 ffmpeg7)
make_torchcodec_library(libtorchcodec6 ffmpeg6)
make_torchcodec_library(libtorchcodec5 ffmpeg5)
make_torchcodec_library(libtorchcodec4 ffmpeg4)

else()
message(
Expand All @@ -97,6 +97,7 @@ else()
libavformat
libavcodec
libavutil
libswresample
libswscale
)

Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/decoders/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ int64_t getDuration(const AVFrame* frame) {
#endif
}

int getNumChannels(const AVFrame* avFrame) {
int getNumChannels(const UniqueAVFrame& avFrame) {
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
return avFrame->ch_layout.nb_channels;
Expand Down
5 changes: 4 additions & 1 deletion src/torchcodec/decoders/_core/FFMPEGCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ extern "C" {
#include <libavutil/opt.h>
#include <libavutil/pixfmt.h>
#include <libavutil/version.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}

Expand Down Expand Up @@ -67,6 +68,8 @@ using UniqueAVIOContext = std::
unique_ptr<AVIOContext, Deleterp<AVIOContext, void, avio_context_free>>;
using UniqueSwsContext =
std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
using UniqueSwrContext =
std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;

// These 2 classes share the same underlying AVPacket object. They are meant to
// be used in tandem, like so:
Expand Down Expand Up @@ -139,7 +142,7 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
int64_t getDuration(const UniqueAVFrame& frame);
int64_t getDuration(const AVFrame* frame);

int getNumChannels(const AVFrame* avFrame);
int getNumChannels(const UniqueAVFrame& avFrame);
int getNumChannels(const UniqueAVCodecContext& avCodecContext);

// Returns true if sws_scale can handle unaligned data.
Expand Down
155 changes: 134 additions & 21 deletions src/torchcodec/decoders/_core/VideoDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ extern "C" {
#include <libavutil/imgutils.h>
#include <libavutil/log.h>
#include <libavutil/pixdesc.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}

Expand Down Expand Up @@ -558,6 +559,12 @@ void VideoDecoder::addAudioStream(int streamIndex) {
static_cast<int64_t>(streamInfo.codecContext->sample_rate);
streamMetadata.numChannels =
static_cast<int64_t>(getNumChannels(streamInfo.codecContext));

// FFmpeg docs say that the decoder will try to decode natively in this
// format, if it can. Docs don't say what the decoder does when it doesn't
// support that format, but it looks like it does nothing, so this probably
// doesn't hurt.
streamInfo.codecContext->request_sample_fmt = AV_SAMPLE_FMT_FLTP;
}

// --------------------------------------------------------------------------
Expand Down Expand Up @@ -1342,37 +1349,93 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
!preAllocatedOutputTensor.has_value(),
"pre-allocated audio tensor not supported yet.");

const AVFrame* avFrame = avFrameStream.avFrame.get();
AVSampleFormat sourceSampleFormat =
static_cast<AVSampleFormat>(avFrameStream.avFrame->format);
AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;

UniqueAVFrame convertedAVFrame;
if (sourceSampleFormat != desiredSampleFormat) {
convertedAVFrame = convertAudioAVFrameSampleFormat(
avFrameStream.avFrame, sourceSampleFormat, desiredSampleFormat);
}
const UniqueAVFrame& avFrame = (sourceSampleFormat != desiredSampleFormat)
? convertedAVFrame
: avFrameStream.avFrame;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Above: I wasn't able to use the call to convertAudioAVFrameSampleFormat within the ternary expression.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The alternative would be to define something like:

UniqueAVFrame maybeConvertAVFrame(
  UniqueAVFrame frame,
  AVSampleFormat source,
  AVSampleFormat desired) {
  if (source != desired) {
    return convertAudioAVFrameSampleFormat(frame, source, desired);
  } else {
    return frame;
  }
}

And then called with:

UniqueAVFrame avFrame = maybeConvertAVFrame(
  avFrameStream.avFrame, 
  sourceSampleFormat,
  desiredSampleFormat);

That's not necessarily better because there's more ceremony in declaring and defining a new function (although it does not need to be a class member function) and then it's not necessarily obvious what's going on with the sample format comparison.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I just realized that convertAudioAVFrameSampleFormat() already has that exact signature. I think we could simplify things a lot if convertAudioAVFrameSampleFormat() became maybeConvertAudioAVFrameSampleFormat() (or a better name?) and only did the conversion if needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't able to make it work.

The function cannot take a UniqueAVFrame avFrame as input, it has to be a ref (or const ref).

Then it has to return either avFrame (which is a ref) or convertedAVFrame which is a UniqueAVFrame allocated within that function.

I tried a few things, but nothing worked. It's possible I'm doing something wrong.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example the following:

diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index 9871db6..3003792 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -1361,14 +1361,8 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
       static_cast<AVSampleFormat>(avFrameStream.avFrame->format);
   AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
 
-  UniqueAVFrame convertedAVFrame;
-  if (sourceSampleFormat != desiredSampleFormat) {
-    convertedAVFrame = convertAudioAVFrameSampleFormat(
-        avFrameStream.avFrame, sourceSampleFormat, desiredSampleFormat);
-  }
-  const UniqueAVFrame& avFrame = (sourceSampleFormat != desiredSampleFormat)
-      ? convertedAVFrame
-      : avFrameStream.avFrame;
+  const UniqueAVFrame& avFrame = convertAudioAVFrameSampleFormat(
+      avFrameStream.avFrame, sourceSampleFormat, desiredSampleFormat);
 
   AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
   TORCH_CHECK(
@@ -1397,9 +1391,11 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
 UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat(
     const UniqueAVFrame& avFrame,
     AVSampleFormat sourceSampleFormat,
-    AVSampleFormat desiredSampleFormat
+    AVSampleFormat desiredSampleFormat) {
+  if (sourceSampleFormat == desiredSampleFormat) {
+    return avFrame;
+  }
 
-) {
   auto& streamInfo = streamInfos_[activeStreamIndex_];
   const auto& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];

yields:

  /home/nicolashug/dev/torchcodec/src/torchcodec/decoders/_core/VideoDecoder.cpp:1396:12: error: use of deleted function ‘std::unique_ptr<_Tp, _Dp>::unique_ptr(const std::unique_ptr<_Tp, _Dp>&) [with _Tp = AVFrame; _Dp = facebook::torchcodec::Deleterp<AVFrame, void, av_frame_free>]’
   1396 |     return avFrame;
        |            ^~~~~~~

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there may be a way to do it, but it requires doing some std::move() calls. I've had to do that on both sides: going in and out of functions. Let's commit this code as-is, and I can try playing with it later.


AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
TORCH_CHECK(
format == desiredSampleFormat,
"Something went wrong, the frame didn't get converted to the desired format. ",
"Desired format = ",
av_get_sample_fmt_name(desiredSampleFormat),
"source format = ",
av_get_sample_fmt_name(format));

auto numSamples = avFrame->nb_samples; // per channel
auto numChannels = getNumChannels(avFrame);
torch::Tensor outputData =
torch::empty({numChannels, numSamples}, torch::kFloat32);

AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
// TODO-AUDIO Implement all formats.
switch (format) {
case AV_SAMPLE_FMT_FLTP: {
uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
for (auto channel = 0; channel < numChannels;
++channel, outputChannelData += numBytesPerChannel) {
memcpy(
outputChannelData,
avFrame->extended_data[channel],
numBytesPerChannel);
}
break;
}
default:
TORCH_CHECK(
false,
"Unsupported audio format (yet!): ",
av_get_sample_fmt_name(format));
uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
for (auto channel = 0; channel < numChannels;
++channel, outputChannelData += numBytesPerChannel) {
memcpy(
outputChannelData, avFrame->extended_data[channel], numBytesPerChannel);
}
frameOutput.data = outputData;
}

UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat(
const UniqueAVFrame& avFrame,
AVSampleFormat sourceSampleFormat,
AVSampleFormat desiredSampleFormat

) {
auto& streamInfo = streamInfos_[activeStreamIndex_];
const auto& streamMetadata =
containerMetadata_.allStreamMetadata[activeStreamIndex_];
int sampleRate = static_cast<int>(streamMetadata.sampleRate.value());

if (!streamInfo.swrContext) {
createSwrContext(
streamInfo, sampleRate, sourceSampleFormat, desiredSampleFormat);
}

UniqueAVFrame convertedAVFrame(av_frame_alloc());
TORCH_CHECK(
convertedAVFrame,
"Could not allocate frame for sample format conversion.");

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Below: I realize we have previously avoided having any ffmpeg-related pre-proc directives in this file. I wonder however if it's really worth creating a separate util for this one line? I personally find the code easier to read this way, but no strong opinion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to consolidate all FFmpeg pre-proc directives in the FFmpeg utility. Most of them are just one line, but we already have 5 or so of them, and I anticipate having more. If we don't consolidate, I think we'd end up having the directives all over VideoDecoder.cpp which I think harms readability.

Copy link
Member Author

@NicolasHug NicolasHug Mar 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will consolidate within FFmpegCommon.h, but I think we could approach this with a more case-by-case basis. If code readability is the main factor, in that specific instance, I personally do not think that the resulting code is more readable:

  setChannelLayout(convertedAVFrame, avFrame);
  convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
  convertedAVFrame->sample_rate = avFrame->sample_rate;
  convertedAVFrame->nb_samples = avFrame->nb_samples;

I.e. it's not obvious that setChannelLayout just sets a single field and is part of the same logic as the 3 following lines.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree it's not great, and I think the answer may be for us to implement them as methods on our unique wrappers. Something like:

convertedAVFrame->setChannelLayout(srcFrame);

But that will require us to define the wrapper logic ourselves, as opposed to just making them an alias to std::unique_ptr.

#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
convertedAVFrame->ch_layout = avFrame->ch_layout;
#else
convertedAVFrame->channel_layout = avFrame->channel_layout;
#endif
convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
convertedAVFrame->sample_rate = avFrame->sample_rate;
convertedAVFrame->nb_samples = avFrame->nb_samples;

auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
TORCH_CHECK(
status == AVSUCCESS,
"Could not allocate frame buffers for sample format conversion: ",
getFFMPEGErrorStringFromErrorCode(status));

auto numSampleConverted = swr_convert(
streamInfo.swrContext.get(),
convertedAVFrame->data,
convertedAVFrame->nb_samples,
(const uint8_t**)avFrame->data,
avFrame->nb_samples);
TORCH_CHECK(
numSampleConverted > 0,
"Error in swr_convert: ",
getFFMPEGErrorStringFromErrorCode(numSampleConverted));

return convertedAVFrame;
}

// --------------------------------------------------------------------------
// OUTPUT ALLOCATION AND SHAPE CONVERSION
// --------------------------------------------------------------------------
Expand Down Expand Up @@ -1606,6 +1669,56 @@ void VideoDecoder::createSwsContext(
streamInfo.swsContext.reset(swsContext);
}

// Lazily creates and initializes the libswresample context used to convert
// decoded audio frames from their native sample format (sourceSampleFormat)
// to the desired one (desiredSampleFormat). Input and output sample rates are
// identical, so no resampling happens — only a sample-format conversion. The
// channel layout is taken from the stream's codec context and used for both
// input and output, so the layout is preserved.
// On success the context is stored in streamInfo.swrContext (a
// UniqueSwrContext), so it is created once and reused for subsequent frames.
void VideoDecoder::createSwrContext(
    StreamInfo& streamInfo,
    int sampleRate,
    AVSampleFormat sourceSampleFormat,
    AVSampleFormat desiredSampleFormat) {
  SwrContext* swrContext = nullptr;

  int status = AVSUCCESS;
  // NOTE(review): gating on LIBAVFILTER_VERSION_MAJOR to detect the FFmpeg
  // major version follows this file's existing convention — confirm it tracks
  // the intended libswresample API change.
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
  // FFmpeg 5+ channel-layout API: swr_alloc_set_opts2 both allocates and
  // configures the context, reporting failure through its return code.
  AVChannelLayout layout = streamInfo.codecContext->ch_layout;
  status = swr_alloc_set_opts2(
      &swrContext,
      &layout, // output layout (same as input: no channel remixing)
      desiredSampleFormat,
      sampleRate, // output rate (same as input: no resampling)
      &layout, // input layout
      sourceSampleFormat,
      sampleRate, // input rate
      0,
      nullptr);

  TORCH_CHECK(
      status == AVSUCCESS,
      "Couldn't create SwrContext: ",
      getFFMPEGErrorStringFromErrorCode(status));
#else
  // Legacy (FFmpeg 4) API: channel layouts are plain integer bitmasks, and
  // allocation failure is signaled by a null return value (checked below).
  int64_t layout =
      static_cast<int64_t>(streamInfo.codecContext->channel_layout);
  swrContext = swr_alloc_set_opts(
      nullptr,
      layout, // output layout (same as input: no channel remixing)
      desiredSampleFormat,
      sampleRate, // output rate (same as input: no resampling)
      layout, // input layout
      sourceSampleFormat,
      sampleRate, // input rate
      0,
      nullptr);
#endif

  TORCH_CHECK(swrContext != nullptr, "Couldn't create swrContext");

  // The context must be initialized with swr_init before swr_convert can use
  // it.
  status = swr_init(swrContext);
  TORCH_CHECK(
      status == AVSUCCESS,
      "Couldn't initialize SwrContext: ",
      getFFMPEGErrorStringFromErrorCode(status));
  streamInfo.swrContext.reset(swrContext);
}

// --------------------------------------------------------------------------
// PTS <-> INDEX CONVERSIONS
// --------------------------------------------------------------------------
Expand Down
12 changes: 12 additions & 0 deletions src/torchcodec/decoders/_core/VideoDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ class VideoDecoder {
FilterGraphContext filterGraphContext;
ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
UniqueSwsContext swsContext;
UniqueSwrContext swrContext;

// Used to know whether a new FilterGraphContext or UniqueSwsContext should
// be created before decoding a new frame.
Expand Down Expand Up @@ -401,6 +402,11 @@ class VideoDecoder {
const AVFrame* avFrame,
torch::Tensor& outputTensor);

UniqueAVFrame convertAudioAVFrameSampleFormat(
const UniqueAVFrame& avFrame,
AVSampleFormat sourceSampleFormat,
AVSampleFormat desiredSampleFormat);

// --------------------------------------------------------------------------
// COLOR CONVERSION LIBRARIES HANDLERS CREATION
// --------------------------------------------------------------------------
Expand All @@ -415,6 +421,12 @@ class VideoDecoder {
const DecodedFrameContext& frameContext,
const enum AVColorSpace colorspace);

void createSwrContext(
StreamInfo& streamInfo,
int sampleRate,
AVSampleFormat sourceSampleFormat,
AVSampleFormat desiredSampleFormat);

// --------------------------------------------------------------------------
// PTS <-> INDEX CONVERSIONS
// --------------------------------------------------------------------------
Expand Down
18 changes: 18 additions & 0 deletions test/decoders/test_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,3 +1080,21 @@ def test_frame_start_is_not_zero(self):

reference_frames = asset.get_frame_data_by_range(start=0, stop=stop_frame_index)
torch.testing.assert_close(samples.data, reference_frames)

def test_single_channel(self):
    # The mono asset must decode to samples with exactly one channel.
    asset = SINE_MONO_S32
    decoder = AudioDecoder(asset.path)

    samples = decoder.get_samples_played_in_range(start_seconds=0, stop_seconds=2)
    num_channels = samples.data.shape[0]
    assert num_channels == asset.num_channels
    assert num_channels == 1

def test_format_conversion(self):
    # The file stores s32 samples, but decoding must always produce float32
    # (fltp) output, matching the stored fltp reference frames.
    asset = SINE_MONO_S32
    decoder = AudioDecoder(asset.path)
    assert asset.sample_format == "s32"
    assert decoder.metadata.sample_format == asset.sample_format

    decoded = decoder.get_samples_played_in_range(start_seconds=0)
    assert decoded.data.dtype == torch.float32

    expected = asset.get_frame_data_by_range(start=0, stop=asset.num_frames)
    torch.testing.assert_close(decoded.data, expected)
Binary file not shown.
3 changes: 3 additions & 0 deletions test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,9 @@ def sample_format(self) -> str:
},
)

# Note that the file itself is s32 sample format, but the reference frames are
# stored as fltp. We can add the s32 original reference frames once we support
# decoding to non-fltp format, but for now we don't need to.
SINE_MONO_S32 = TestAudio(
filename="sine_mono_s32.wav",
default_stream_index=0,
Expand Down
Loading