Allow option to use the swscale library for color conversion instead of filtergraph #205

Merged: 33 commits, Sep 26, 2024
Changes from 25 commits
2 changes: 1 addition & 1 deletion .github/workflows/cpp_tests.yaml
@@ -65,5 +65,5 @@ jobs:
Torch_DIR="${TORCH_PATH}/share/cmake/Torch"
cmake .. -DTorch_DIR=$Torch_DIR -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTS=ON -DCMAKE_VERBOSE_MAKEFILE=ON
cmake --build .
ctest
ctest --output-on-failure
popd
177 changes: 131 additions & 46 deletions benchmarks/decoders/benchmark_decoders.py
@@ -16,7 +16,7 @@
from torchcodec.decoders import SimpleVideoDecoder

from torchcodec.decoders._core import (
add_video_stream,
_add_video_stream,
create_from_file,
get_frames_at_indices,
get_json_metadata,
@@ -86,38 +86,72 @@ def get_consecutive_frames_from_video(self, video_file, numFramesToDecode):
class TVNewAPIDecoderWithBackend(AbstractDecoder):
def __init__(self, backend):
self._backend = backend
self._print_each_iteration_time = False
import torchvision # noqa: F401

self.torchvision = torchvision

def get_frames_from_video(self, video_file, pts_list):
start = timeit.default_timer()
self.torchvision.set_video_backend(self._backend)
reader = self.torchvision.io.VideoReader(video_file, "video")
create_done = timeit.default_timer()
frames = []
for pts in pts_list:
reader.seek(pts)
frame = next(reader)
frames.append(frame["data"].permute(1, 2, 0))
frames_done = timeit.default_timer()
if self._print_each_iteration_time:
del reader
Review thread on the del reader line:

Contributor @scotts commented on Sep 23, 2024:
Are you intending to time how long it takes to do the deallocations of memory on the C++ side? Note that del reader on the Python side will only decrement a reference counter. You can make an explicit call to the garbage collector (see https://docs.python.org/3/library/gc.html#gc.collect), but that's going to do a full collection of all garbage. If you're trying to time how long it takes to deallocate the objects on the C++ side, I don't know if there's a way to do that reliably from Python.

Contributor Author:

This code was for my own debugging/edification and is turned off by default; a private variable controls it.

I can delete it if you want. It's developer-only code -- not user-facing.

Let me know.

By the way, I timed the code -- when using PyTorch to time it, the aggregate profile doesn't change when I call del decoder. It's just that if you print the individual iterations, you can see a timing difference once every few iterations.

Contributor:
I think the benchmark is easier to understand without it, honestly. If we keep it, then we need a comment explaining that we know this would happen anyway when the function exits, that this does not actually cause the GC to happen, and that doing it here before the function exits allows us to sometimes see the cost of GC.

Contributor Author:

I removed the del line now
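
An aside (not part of the PR's diff): a minimal runnable sketch of the distinction discussed above -- del only drops a reference count, while gc.collect() forces a full collection. The Reader class below is a hypothetical stand-in for a decoder object holding a large buffer.

import gc
import timeit


class Reader:
    # Hypothetical stand-in for a decoder object owning a large buffer.
    def __init__(self):
        self._buffer = bytearray(100 * 1024 * 1024)  # ~100 MB


reader = Reader()

# del only decrements the reference count. With no other references left,
# CPython frees the object immediately, but del itself is not a GC call.
start = timeit.default_timer()
del reader
del_duration = timeit.default_timer() - start

# gc.collect() runs a full collection of all tracked garbage, which is the
# explicit call the reviewer mentions.
start = timeit.default_timer()
gc.collect()
gc_duration = timeit.default_timer() - start

print(f"{del_duration=:.6f} {gc_duration=:.6f}")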

del_done = timeit.default_timer()
create_duration = 1000 * round(create_done - start, 3)
frames_duration = 1000 * round(frames_done - create_done, 3)
del_duration = 1000 * round(del_done - frames_done, 3)
total_duration = 1000 * round(del_done - start, 3)
if self._print_each_iteration_time:
print(
f"TV: {create_duration=} {frames_duration=} {del_duration=} {total_duration=}"
)
return frames

def get_consecutive_frames_from_video(self, video_file, numFramesToDecode):
start = timeit.default_timer()
self.torchvision.set_video_backend(self._backend)
reader = self.torchvision.io.VideoReader(video_file, "video")
create_done = timeit.default_timer()
frames = []
for _ in range(numFramesToDecode):
frame = next(reader)
frames.append(frame["data"].permute(1, 2, 0))
frames_done = timeit.default_timer()
if self._print_each_iteration_time:
del reader
del_done = timeit.default_timer()
create_duration = 1000 * round(create_done - start, 3)
frames_duration = 1000 * round(frames_done - create_done, 3)
del_duration = 1000 * round(del_done - frames_done, 3)
total_duration = 1000 * round(del_done - start, 3)
if self._print_each_iteration_time:
print(
f"TV: consecutive: {create_duration=} {frames_duration=} {del_duration=} {total_duration=} {frames[0].shape=}"
)
return frames


class TorchCodecDecoderNonCompiledWithOptions(AbstractDecoder):
def __init__(self, num_threads=None):
class TorchcodecNonCompiledWithOptions(AbstractDecoder):
def __init__(self, num_threads=None, color_conversion_library=None):
self._print_each_iteration_time = False
self._num_threads = num_threads
self._num_threads = int(num_threads) if num_threads else None
self._color_conversion_library = color_conversion_library

def get_frames_from_video(self, video_file, pts_list):
decoder = create_from_file(video_file)
add_video_stream(decoder, num_threads=self._num_threads)
_add_video_stream(
decoder,
num_threads=self._num_threads,
color_conversion_library=self._color_conversion_library,
)
frames = []
times = []
for pts in pts_list:
@@ -132,30 +166,54 @@ def get_frames_from_video(self, video_file, pts_list):
return frames

def get_consecutive_frames_from_video(self, video_file, numFramesToDecode):
create_time = timeit.default_timer()
decoder = create_from_file(video_file)
add_video_stream(decoder, num_threads=self._num_threads)
add_stream_time = timeit.default_timer()
_add_video_stream(
decoder,
num_threads=self._num_threads,
color_conversion_library=self._color_conversion_library,
)
frames = []
times = []
frames_time = timeit.default_timer()
for _ in range(numFramesToDecode):
start = timeit.default_timer()
frame = get_next_frame(decoder)
end = timeit.default_timer()
times.append(round(end - start, 3))
frames.append(frame)
del_time = timeit.default_timer()
if self._print_each_iteration_time:
del decoder
done_time = timeit.default_timer()
create_duration = 1000 * round(add_stream_time - create_time, 3)
add_stream_duration = 1000 * round(frames_time - add_stream_time, 3)
frames_duration = 1000 * round(del_time - frames_time, 3)
del_duration = 1000 * round(done_time - del_time, 3)
total_duration = 1000 * round(done_time - create_time, 3)
if self._print_each_iteration_time:
print(
f"{numFramesToDecode=} {create_duration=} {add_stream_duration=} {frames_duration=} {del_duration=} {total_duration=} {frames[0][0].shape=}"
)
print("torchcodec times=", times, sum(times))
return frames


class TorchCodecDecoderNonCompiledBatch(AbstractDecoder):
def __init__(self, num_threads=None):
class TorchCodecNonCompiledBatch(AbstractDecoder):
def __init__(self, num_threads=None, color_conversion_library=None):
self._print_each_iteration_time = False
self._num_threads = num_threads
self._num_threads = int(num_threads) if num_threads else None
self._color_conversion_library = color_conversion_library

def get_frames_from_video(self, video_file, pts_list):
decoder = create_from_file(video_file)
scan_all_streams_to_update_metadata(decoder)
add_video_stream(decoder, num_threads=self._num_threads)
_add_video_stream(
decoder,
num_threads=self._num_threads,
color_conversion_library=self._color_conversion_library,
)
metadata = json.loads(get_json_metadata(decoder))
average_fps = metadata["averageFps"]
best_video_stream = metadata["bestVideoStreamIndex"]
@@ -169,7 +227,11 @@ def get_frames_from_video(self, video_file, pts_list):
def get_consecutive_frames_from_video(self, video_file, numFramesToDecode):
decoder = create_from_file(video_file)
scan_all_streams_to_update_metadata(decoder)
add_video_stream(decoder, num_threads=self._num_threads)
_add_video_stream(
decoder,
num_threads=self._num_threads,
color_conversion_library=self._color_conversion_library,
)
metadata = json.loads(get_json_metadata(decoder))
best_video_stream = metadata["bestVideoStreamIndex"]
frames = []
@@ -191,13 +253,13 @@ def compiled_next(decoder):
return get_next_frame(decoder)


class TorchCodecDecoderCompiled(AbstractDecoder):
class TorchcodecCompiled(AbstractDecoder):
def __init__(self):
pass

def get_frames_from_video(self, video_file, pts_list):
decoder = create_from_file(video_file)
add_video_stream(decoder)
_add_video_stream(decoder)
frames = []
for pts in pts_list:
frame = compiled_seek_and_next(decoder, pts)
Expand All @@ -206,7 +268,7 @@ def get_frames_from_video(self, video_file, pts_list):

def get_consecutive_frames_from_video(self, video_file, numFramesToDecode):
decoder = create_from_file(video_file)
add_video_stream(decoder)
_add_video_stream(decoder)
frames = []
for _ in range(numFramesToDecode):
frame = compiled_next(decoder)
@@ -259,7 +321,7 @@ def get_test_resource_path(filename: str) -> str:

def create_torchcodec_decoder_from_file(video_file):
video_decoder = create_from_file(video_file)
add_video_stream(video_decoder)
_add_video_stream(video_decoder)
get_next_frame(video_decoder)
return video_decoder

@@ -294,9 +356,13 @@ def main() -> None:
)
parser.add_argument(
"--decoders",
help="Comma-separated list of decoders to benchmark. Choices are torchcodec, torchaudio, torchvision, decord, torchcodec1, torchcodec_compiled. torchcodec1 means torchcodec with num_threads=1. torchcodec_compiled means torch.compiled torchcodec. torchcodec_batch means torchcodec using batch methods.",
help=(
"Comma-separated list of decoders to benchmark. "
"Choices are torchcodec, torchaudio, torchvision, decord, tcoptions:num_threads=1+color_conversion_library=filtergraph, torchcodec_compiled"
"For torchcodec, you can specify options with tcoptions:<plus-separated-options>. "
),
type=str,
default="decord,torchcodec,torchvision,torchaudio,torchcodec1,torchcodec_compiled,torchcodec_batch",
default="decord,tcoptions:,torchvision,torchaudio,torchcodec_compiled,tcoptions:num_threads=1",
)

args = parser.parse_args()
@@ -306,38 +372,53 @@ def main() -> None:
num_uniform_samples = 10

decoder_dict = {}
if "decord" in decoders:
decoder_dict["DecordNonBatchDecoderAccurateSeek"] = (
DecordNonBatchDecoderAccurateSeek()
)
if "torchcodec" in decoders:
decoder_dict["TorchCodecDecoderNonCompiled"] = (
TorchCodecDecoderNonCompiledWithOptions()
)
if "torchcodec_compiled" in decoders:
decoder_dict["TorchCodecDecoderCompiled"] = TorchCodecDecoderCompiled()
if "torchcodec1" in decoders:
decoder_dict["TCNonCompiled:ffmpeg_thread_count=1"] = (
TorchCodecDecoderNonCompiledWithOptions(num_threads=1)
)
# We don't compare TorchVision's "pyav" backend because it doesn't support
# accurate seeks.
if "torchvision" in decoders:
decoder_dict["TVNewAPIDecoderWithBackendVideoReader"] = (
TVNewAPIDecoderWithBackend("video_reader")
)
if "torchaudio" in decoders:
decoder_dict["TorchAudioDecoder"] = TorchAudioDecoder()
if "torchcodec_batch" in decoders:
decoder_dict["TorchCodecDecoderNonCompiledBatch"] = (
TorchCodecDecoderNonCompiledBatch()
)
for decoder in decoders:
if decoder == "decord":
decoder_dict["DecordNonBatchDecoderAccurateSeek"] = (
DecordNonBatchDecoderAccurateSeek()
)
elif decoder == "torchcodec":
decoder_dict["TorchCodecNonCompiled"] = TorchcodecNonCompiledWithOptions()
elif decoder == "torchcodec_compiled":
decoder_dict["TorchcodecCompiled"] = TorchcodecCompiled()
elif decoder == "torchvision":
decoder_dict["TVNewAPIDecoderWithBackendVideoReader"] = (
# We don't compare TorchVision's "pyav" backend because it doesn't support
# accurate seeks.
TVNewAPIDecoderWithBackend("video_reader")
)
elif decoder == "torchaudio":
decoder_dict["TorchAudioDecoder"] = TorchAudioDecoder()
elif decoder.startswith("tcbatchoptions:"):
options = decoder[len("tcbatchoptions:") :]
kwargs_dict = {}
for item in options.split("+"):
if item.strip() == "":
continue
k, v = item.split("=")
kwargs_dict[k] = v
decoder_dict["TorchCodecNonCompiledBatch:" + options] = (
TorchCodecNonCompiledBatch(**kwargs_dict)
)
elif decoder.startswith("tcoptions:"):
options = decoder[len("tcoptions:") :]
kwargs_dict = {}
for item in options.split("+"):
if item.strip() == "":
continue
k, v = item.split("=")
kwargs_dict[k] = v
decoder_dict["TorchcodecNonCompiled:" + options] = (
TorchcodecNonCompiledWithOptions(**kwargs_dict)
)

decoder_dict["TVNewAPIDecoderWithBackendVideoReader"]

results = []
for decoder_name, decoder in decoder_dict.items():
for video_path in args.bm_video_paths.split(","):
# We only use the SimpleVideoDecoder to get the metadata and get
# the list of PTS values to seek to.
simple_decoder = SimpleVideoDecoder(video_path)
duration = simple_decoder.metadata.duration_seconds
pts_list = [
@@ -365,7 +446,7 @@ def main() -> None:
min_run_time=args.bm_video_speed_min_run_seconds
)
)
for num_consecutive_nexts in [1, 10, 100]:
for num_consecutive_nexts in [1, 10]:
consecutive_frames_result = benchmark.Timer(
stmt="decoder.get_consecutive_frames_from_video(video_file, consecutive_frames_to_extract)",
globals={
@@ -392,10 +473,14 @@ def main() -> None:
"create_torchcodec_decoder_from_file": create_torchcodec_decoder_from_file,
},
label=f"video={first_video_path} {metadata_string}",
sub_label="TorchCodecDecoderNonCompiled",
sub_label="TorchcodecNonCompiled",
description="create()+next()",
)
results.append(creation_result.blocked_autorange(min_run_time=10.0))
results.append(
creation_result.blocked_autorange(
min_run_time=2.0,
)
)
compare = benchmark.Compare(results)
compare.print()

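
For context (not part of the PR's diff), a minimal sketch of how the two color-conversion paths would be exercised through the core ops used in this benchmark. The accepted values for color_conversion_library are assumed from the PR title and the help text to be "filtergraph" and "swscale", and the video path below is a placeholder. On the command line, the same path would be selected with a spec like tcoptions:num_threads=1+color_conversion_library=swscale, which the parsing loop above splits on "+" and forwards as keyword arguments.

import timeit

from torchcodec.decoders._core import (
    _add_video_stream,
    create_from_file,
    get_next_frame,
)


def time_decode(video_file, color_conversion_library, num_frames=100):
    # color_conversion_library is assumed to accept "filtergraph" or "swscale".
    decoder = create_from_file(video_file)
    _add_video_stream(
        decoder,
        num_threads=1,
        color_conversion_library=color_conversion_library,
    )
    start = timeit.default_timer()
    for _ in range(num_frames):
        get_next_frame(decoder)
    return timeit.default_timer() - start


# Hypothetical usage with a placeholder path:
# print(time_decode("/path/to/video.mp4", "filtergraph"))
# print(time_decode("/path/to/video.mp4", "swscale"))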
1 change: 1 addition & 0 deletions src/torchcodec/decoders/_core/CMakeLists.txt
@@ -85,6 +85,7 @@ else()
libavformat
libavcodec
libavutil
libswscale
)

# Split libavcodec's version string by '.' and convert it to a list
12 changes: 12 additions & 0 deletions src/torchcodec/decoders/_core/FFMPEGCommon.h
@@ -22,6 +22,7 @@ extern "C" {
#include <libavutil/opt.h>
#include <libavutil/pixfmt.h>
#include <libavutil/version.h>
#include <libswscale/swscale.h>
}

namespace facebook::torchcodec {
@@ -38,6 +39,15 @@ struct Deleterp {
}
};

template <typename T, typename R, R (*Fn)(T*)>
struct Deleter {
inline void operator()(T* p) const {
if (p) {
Fn(p);
}
}
};

// Unique pointers for FFMPEG structures.
using UniqueAVFormatContext = std::unique_ptr<
AVFormatContext,
@@ -57,6 +67,8 @@ using UniqueAVFilterInOut = std::unique_ptr<
Deleterp<AVFilterInOut, void, avfilter_inout_free>>;
using UniqueAVIOContext = std::
unique_ptr<AVIOContext, Deleterp<AVIOContext, void, avio_context_free>>;
using UniqueSwsContext =
std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;

// av_find_best_stream is not const-correct before commit:
// https://github.com/FFmpeg/FFmpeg/commit/46dac8cf3d250184ab4247809bc03f60e14f4c0c