Video FFMpeg使用指南 (v1.5)-真武 PPU 云服务(ppu)-阿里云帮助中心

1. 概述

PPU SDK兼容支持Nvidia Video Codec SDK，所以能直接使用FFmpeg的cuvid、nvenc、libnpp这些插件来支持硬件加速，不需要做额外的代码修改。已经验证的FFmpeg版本包括：

FFmpeg 7.0.1

对应nv-codec-headers 12.2.72.0

Video Codec SDK需支持到12.2及以上版本

FFmpeg官方最新版本，大幅优化了transcode的并行化性能。

FFmpeg 6.1.2

依赖nv-codec-headers 12.1.14.0

Video Codec SDK需支持到12.1及以上版本

PyAV等框架依赖这个版本。这也是目前使用最广泛的版本。

2. 自行编译

FFmpeg：https://github.com/FFmpeg/FFmpeg

nv-codec-headers：https://github.com/FFmpeg/nv-codec-headers

重要

请注意FFmpeg版本跟nv-codec-headers版本之间的对应匹配关系。

下文有FFmpeg 7.0和FFmpeg 6.1的编译说明，编译前请确认已经安装PPU SDK并执行了envsetup.sh，配置好SDK的环境。

2.1. 准备ffmpeg code

setup_ffmpeg.sh：

## Clone the 12.2 release (tag n12.2.72.0) of nv-codec-headers, which matches FFmpeg 7.0
git clone --branch n12.2.72.0 --depth 1 https://github.com/FFmpeg/nv-codec-headers
## Provide ffnvcodec.pc by copying the template shipped in the repository
cp nv-codec-headers/ffnvcodec.pc.in nv-codec-headers/ffnvcodec.pc
    
## Download and unpack the FFmpeg 7.0.1 source tarball
wget https://ffmpeg.org/releases/ffmpeg-7.0.1.tar.gz --no-check-certificate
tar -xzvf ffmpeg-7.0.1.tar.gz

使用dynlink_loader.h替换nv-codec-headers/include/ffnvcodec/目录下的同名文件。

Note：附件dynlink_loader.h相对原文件有两类改动：一是注释掉（删除）了一些不支持或不需要的接口函数，二是把部分LOAD_SYMBOL加载的符号名改成带_v2后缀的版本，摘录如下：

    // tcuGLGetDevices_v2 *cuGLGetDevices;
    // tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
    // tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
    // tcuGraphicsMapResources *cuGraphicsMapResources;
    // tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
    // tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
    // tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;

    ... ...

    // tcuArrayCreate *cuArrayCreate;
    // tcuArray3DCreate *cuArray3DCreate;
    // tcuArrayDestroy *cuArrayDestroy;

    // tcuEGLStreamProducerConnect *cuEGLStreamProducerConnect;
    // tcuEGLStreamProducerDisconnect *cuEGLStreamProducerDisconnect;
    // tcuEGLStreamConsumerDisconnect *cuEGLStreamConsumerDisconnect;
    // tcuEGLStreamProducerPresentFrame *cuEGLStreamProducerPresentFrame;
    // tcuEGLStreamProducerReturnFrame *cuEGLStreamProducerReturnFrame;

... ...
    // LOAD_SYMBOL(cuDevicePrimaryCtxRelease, tcuDevicePrimaryCtxRelease, "cuDevicePrimaryCtxRelease");
    LOAD_SYMBOL(cuDevicePrimaryCtxRelease, tcuDevicePrimaryCtxRelease, "cuDevicePrimaryCtxRelease_v2");
    // LOAD_SYMBOL(cuDevicePrimaryCtxSetFlags, tcuDevicePrimaryCtxSetFlags, "cuDevicePrimaryCtxSetFlags");
    LOAD_SYMBOL(cuDevicePrimaryCtxSetFlags, tcuDevicePrimaryCtxSetFlags, "cuDevicePrimaryCtxSetFlags_v2");
    
    ... ...
    // LOAD_SYMBOL(cuDevicePrimaryCtxReset, tcuDevicePrimaryCtxReset, "cuDevicePrimaryCtxReset");
    LOAD_SYMBOL(cuDevicePrimaryCtxReset, tcuDevicePrimaryCtxReset, "cuDevicePrimaryCtxReset_v2");

    ... ... 

    // LOAD_SYMBOL(cuLinkCreate, tcuLinkCreate, "cuLinkCreate");
    LOAD_SYMBOL(cuLinkCreate, tcuLinkCreate, "cuLinkCreate_v2");
    // LOAD_SYMBOL(cuLinkAddData, tcuLinkAddData, "cuLinkAddData");
    LOAD_SYMBOL(cuLinkAddData, tcuLinkAddData, "cuLinkAddData_v2");

    ... ... 
        
    // LOAD_SYMBOL(cuModuleGetGlobal, tcuModuleGetGlobal, "cuModuleGetGlobal");
    LOAD_SYMBOL(cuModuleGetGlobal, tcuModuleGetGlobal, "cuModuleGetGlobal_v2");
    
    ... ...

    // LOAD_SYMBOL(cuGLGetDevices, tcuGLGetDevices_v2, "cuGLGetDevices_v2");
    // LOAD_SYMBOL(cuGraphicsGLRegisterImage, tcuGraphicsGLRegisterImage, "cuGraphicsGLRegisterImage");
    // LOAD_SYMBOL(cuGraphicsUnregisterResource, tcuGraphicsUnregisterResource, "cuGraphicsUnregisterResource");
    // LOAD_SYMBOL(cuGraphicsMapResources, tcuGraphicsMapResources, "cuGraphicsMapResources");
    // LOAD_SYMBOL(cuGraphicsUnmapResources, tcuGraphicsUnmapResources, "cuGraphicsUnmapResources");
    // LOAD_SYMBOL(cuGraphicsSubResourceGetMappedArray, tcuGraphicsSubResourceGetMappedArray, "cuGraphicsSubResourceGetMappedArray");
    // LOAD_SYMBOL(cuGraphicsResourceGetMappedPointer, tcuGraphicsResourceGetMappedPointer, "cuGraphicsResourceGetMappedPointer_v2");

    ... ...
    // LOAD_SYMBOL(cuArrayCreate, tcuArrayCreate, "cuArrayCreate_v2");
    // LOAD_SYMBOL(cuArray3DCreate, tcuArray3DCreate, "cuArray3DCreate_v2");
    // LOAD_SYMBOL(cuArrayDestroy, tcuArrayDestroy, "cuArrayDestroy");

    // LOAD_SYMBOL_OPT(cuEGLStreamProducerConnect, tcuEGLStreamProducerConnect, "cuEGLStreamProducerConnect");
    // LOAD_SYMBOL_OPT(cuEGLStreamProducerDisconnect, tcuEGLStreamProducerDisconnect, "cuEGLStreamProducerDisconnect");
    // LOAD_SYMBOL_OPT(cuEGLStreamConsumerDisconnect, tcuEGLStreamConsumerDisconnect, "cuEGLStreamConsumerDisconnect");
    // LOAD_SYMBOL_OPT(cuEGLStreamProducerPresentFrame, tcuEGLStreamProducerPresentFrame, "cuEGLStreamProducerPresentFrame");
    // LOAD_SYMBOL_OPT(cuEGLStreamProducerReturnFrame, tcuEGLStreamProducerReturnFrame, "cuEGLStreamProducerReturnFrame");

2.2. build ffmpeg

build_ffmpeg.sh：

# Build FFmpeg 7.0.1 with cuvid / nvenc / libnpp enabled and install it into ./output.
# Requires CUDA_HOME to be set (it is referenced by the configure flags below).

# -p: do not fail when the script is re-run and output/ already exists
mkdir -p output
cd ffmpeg-7.0.1
# Make ffnvcodec.pc (prepared next to the FFmpeg sources) visible to pkg-config
export PKG_CONFIG_PATH="$(pwd)/../nv-codec-headers:$PKG_CONFIG_PATH"
# Chain the steps with && so make/install never run after a failed configure
./configure --enable-shared \
    --enable-nonfree --enable-gpl \
    --enable-cuvid --enable-nvenc --enable-libnpp \
    --extra-cflags="-I$(pwd)/../nv-codec-headers/include -I$CUDA_HOME/include" \
    --extra-ldflags="-L$CUDA_HOME/lib64" \
    --prefix="$(pwd)/../output" \
    && make -j"$(nproc)" && make install

说明

--prefix指向编译成功后的安装目录，如果不设置，默认安装到/usr/local目录。

编译安装成功后，在安装目录下会生成bin、include、lib、share。lib目录下有pkgconfig目录，把该目录加入PKG_CONFIG_PATH后（例如export PKG_CONFIG_PATH=<安装目录>/lib/pkgconfig:$PKG_CONFIG_PATH），输入pkg-config --modversion libavcodec可以看到版本号。

3. 运行示例

# Switch to the install directory (the --prefix used at build time)
cd ../output
# Relative entry: the freshly built shared libraries resolve only while the
# working directory is this install directory
export LD_LIBRARY_PATH=lib:$LD_LIBRARY_PATH

# ffmpeg decode test with cuvid
bin/ffmpeg -vcodec h264_cuvid -i 640x360_y8c8.h264 -frames 10 -y out-dec-h264.yuv

# ffmpeg transcode test with cuvid and nvenc
bin/ffmpeg -hwaccel_output_format cuda -vcodec h264_cuvid -i 640x360_y8c8.h264 -vcodec hevc_nvenc -r 30.0 -b:v 15000000 -preset p4 output.hevc

# ffmpeg transcode test with cuvid and nvenc (300 frames, preset p7)
# NOTE(review): this example was labelled "resize with npp support", but the
# command contains no scale_npp filter -- TODO confirm the intended -vf option
bin/ffmpeg -hwaccel_output_format cuda -vcodec h264_cuvid -i input.h264 -vcodec hevc_nvenc -frames 300 -preset p7 -acodec copy -y output.h265

4. FAQ

Q. 目前ffmpeg基于PPU可以支持哪些硬件加速能力？

A. 可以支持h264_cuvid，hevc_cuvid，av1_cuvid和vp9_cuvid硬件解码（如果有avs2的需求，可以申请额外补丁包）；可以支持h264_nvenc、hevc_nvenc、av1_nvenc硬件编码；可以支持transpose、color-conversion、scale npp加速。

5. 附录A：ffmpeg6.1的编译

ffmpeg 7.0在transcode/multistream并行优化上做了很大的架构改动，带来了明显的性能提升，但如果因为兼容性的原因需要使用ffmpeg 6.1，则需要下载安装旧的版本。

## Clone the 12.1 release (tag n12.1.14.0) of nv-codec-headers, which matches FFmpeg 6.1
git clone --branch n12.1.14.0 --depth 1 https://github.com/FFmpeg/nv-codec-headers
## Provide ffnvcodec.pc by copying the template shipped in the repository
cp nv-codec-headers/ffnvcodec.pc.in nv-codec-headers/ffnvcodec.pc
    
## Download and unpack the FFmpeg 6.1.2 source tarball
wget https://ffmpeg.org/releases/ffmpeg-6.1.2.tar.gz --no-check-certificate
tar -xzvf ffmpeg-6.1.2.tar.gz

dynlink_loader.h

使用附件dynlink_loader.h替换nv-codec-headers/include/ffnvcodec/目录下的同名文件，后续编译和运行指令一致。

# Build FFmpeg 6.1.2 with cuvid / nvenc / libnpp enabled and install it into ./output.
# Requires CUDA_HOME to be set (it is referenced by the configure flags below).

# -p: do not fail when the script is re-run and output/ already exists
mkdir -p output
cd ffmpeg-6.1.2
# Make ffnvcodec.pc (prepared next to the FFmpeg sources) visible to pkg-config
export PKG_CONFIG_PATH="$(pwd)/../nv-codec-headers:$PKG_CONFIG_PATH"
# Chain the steps with && so make/install never run after a failed configure
./configure --enable-shared \
    --enable-nonfree --enable-gpl \
    --enable-cuvid --enable-nvenc --enable-libnpp \
    --extra-cflags="-I$(pwd)/../nv-codec-headers/include -I$CUDA_HOME/include" \
    --extra-ldflags="-L$CUDA_HOME/lib64" \
    --prefix="$(pwd)/../output" \
    && make -j"$(nproc)" && make install

6. 附录B：ffmpeg生态

6.1. ffmpeg-python

官网：https://github.com/kkroening/ffmpeg-python

ffmpeg-python实质是把你输入的参数拼合之后，调用ffmpeg程序来执行，所以需要你的运行环境有安装ffmpeg，或者设置了ffmpeg所在的bin和lib目录。

因为PPU Video Codec只支持cuvid decode，所以只配置hwaccel='cuda'是不行的，需要通过vcodec参数显式指定对应的cuvid解码器（例如h264_cuvid）。

参考示例：

import ffmpeg
import sys

# Probe the container and pick the first video stream (None when there is none).
probe = ffmpeg.probe('input.mp4')
video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)

# Nothing to decode without a video stream or a codec name; .get() avoids a
# KeyError when the probe result carries no 'codec_name' entry.
if video_stream is None or video_stream.get('codec_name') is None:
    sys.exit()

if video_stream['codec_name'] in ['h264', 'hevc', 'av1', 'vp9']:
    # Codec has a cuvid decoder: select "<codec>_cuvid" explicitly.
    out, _ = (
        ffmpeg.input('input.mp4', vcodec=video_stream['codec_name'] + '_cuvid')
        .output('output.rgb', format='rawvideo', pix_fmt='rgb24')
        .run()
    )
else:
    # Any other codec: let ffmpeg choose its default decoder.
    out, _ = (
        ffmpeg.input('input.mp4')
        .output('output.rgb', format='rawvideo', pix_fmt='rgb24')
        .run()
    )

ffmpeg-python本质上是ffmpeg指令封装，除了文件之外，也可以基于'pipe:'输入和输出数据（即stdin, stdout），数据在CPU而编解码在GPU，所以不可避免会有数据的拷贝，因此不推荐。

6.2. PyAV

Torchvision支持PyAV，video_reader和cuda这三个backend，默认的backend是PyAV。这其中video_reader是纯CPU编解码，cuda是使用GPU decoder解码。PyAV则会调用FFmpeg来完成编解码。所以PyAV实际是依赖FFmpeg本身支持的硬件加速能力，需要编译安装支持硬件加速的ffmpeg。

说明

到目前为止，PyAV还不支持FFmpeg 7.0，所以需要编译附录A提到的FFmpeg6.1版本。

PyAV官网：https://github.com/PyAV-Org/PyAV

调用路径1：pytorch->torchvision->PyAV->ffmpeg：

硬件解码支持（需要修改torchvision的代码，torchvision/io/video_reader.py）：

--- a/torchvision/io/video_reader.py
+++ b/torchvision/io/video_reader.py
@@ -280,6 +280,13 @@ class VideoReader:
         if self.backend == "pyav":
             stream_type = stream.split(":")[0]
             stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
+            video_stream = self.container.streams.video[stream_id]
+            # Setting up the codec with cuvid
+            if video_stream.codec.name in ('h264', 'hevc', 'av1', 'vp9'):
+                codec_name = f'{video_stream.codec.name}_cuvid'
+            else:
+                codec_name = video_stream.codec.name  # Fallback to software decoding
+            video_stream.codec_context = av.codec.CodecContext.create(codec_name, 'r')
             self.pyav_stream = {stream_type: stream_id}
             self._c = self.container.decode(**self.pyav_stream)

PyAV输出的数据默认是TCHW格式。

硬件编码支持（不需要修改torchvision代码，但用户调用时指定使用nvenc名称）：

from torchvision.io import write_video
# save_path, frames and fps are placeholders supplied by the caller.
write_video(save_path, frames, fps=fps, video_codec="h264_nvenc")  # The encoder must be named explicitly; currently supported: h264_nvenc, av1_nvenc, hevc_nvenc

调用路径2：pytorch -> torchaudio -> ffmpeg：

Torchaudio的StreamReader直接基于FFmpeg库实现（不经过PyAV），同样可以实现硬件解码，不需要修改torchaudio代码，但需要明确指定cuvid的名称：

import torch
import torchaudio

from torchaudio.io import StreamReader
from torchaudio.utils import ffmpeg_utils

# src is a placeholder for the input media (not defined in this snippet).
s = StreamReader(src)
# The first argument is derived from the frame rate reported for source stream 0.
s.add_video_stream(int(s.get_src_stream_info(0).frame_rate), decoder="h264_cuvid") # The cuvid decoder must be named explicitly; currently supported: h264_cuvid, av1_cuvid, hevc_cuvid, vp9_cuvid
# Fill the internal buffer, then pop the decoded chunk of the single output stream.
s.fill_buffer()
(video,) = s.pop_chunks()

备注：可以通过 torchvision.get_video_backend()获得当前的backend，可以通过torchvision.set_video_backend("pyav")设置torchvision的默认backend。但默认安装的torchvision不支持“cuda”这个backend，需要手动编译安装torchvision。

6.3. torchcodec

官方文档：https://pytorch.org/torchcodec/stable/index.html

项目repo：https://github.com/pytorch/torchcodec

使能通路：pytorch-> torchcodec -> ffmpeg

torchcodec官方目前还是在开发阶段，等其稳定版本发布后会将其加入SAIL pip源中

torchcodec最新已经发布了其0.2.1 release，SAIL SDK也对其中的三个版本（0.2.1, 0.2.0和0.1.1版本）进行了硬件视频解码加速能力的支持。

torchcodec版本和PyTorch版本的依赖关系：

torchcodec	torch	Python
main / nightly	main / nightly	>=3.9, <=3.13
0.2.1	2.6	>=3.9, <=3.13
0.2.0	2.6	>=3.9, <=3.13
0.1.1	2.5	>=3.9, <=3.12

安装方法：

安装依赖库：FFmpeg (上述FFmpeg6.1和FFmpeg7.0都是可以的)
源码编译并打包安装：

# Replace <branch_name> with one of: release/0.2.1 release/0.2.0 release/0.1.1
git clone --branch <branch_name> https://github.com/pytorch/torchcodec.git
cd torchcodec/
# Apply the diff matching the checked-out release (torchcodec_0.1.1.diff, torchcodec_0.2.0.diff or torchcodec_0.2.1.diff)
git apply torchcodec_<version>.diff
# Build and install from source with CUDA support enabled
ENABLE_CUDA=1 I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1 pip install . --no-build-isolation

TorchCodec源码需要做两处改动以支持在PPU上运行：

Decode使用cuvid (SAIL支持cuvid decode, 暂不支持nvdec)
CUDA加速时，sw_scale->scale_npp以使能npp加速（仅0.1.1需要）

torchcodec_0.1.1.diff

diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index df8aaae..af1b794 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -312,7 +312,7 @@ void VideoDecoder::initializeFilterGraphForStream(
   }
   const AVFilter* buffersrc = avfilter_get_by_name("buffer");
   const AVFilter* buffersink = avfilter_get_by_name("buffersink");
-  enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24, AV_PIX_FMT_NONE};
+  enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24, AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE};
   const StreamInfo& activeStream = streams_[streamIndex];

   AVCodecContext* codecContext = activeStream.codecContext.get();
@@ -341,6 +341,12 @@ void VideoDecoder::initializeFilterGraphForStream(
         std::string("Failed to create filter graph: ") + args + ": " +
         getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
   }
+  if (options.device.type() == torch::kCUDA) {
+    AVBufferSrcParameters *par_in = av_buffersrc_parameters_alloc();
+    par_in->hw_frames_ctx = codecContext->hw_frames_ctx;
+    av_buffersrc_parameters_set(filterState.sourceContext, par_in);
+    av_freep(&par_in);
+  }
   ffmpegStatus = avfilter_graph_create_filter(
       &filterState.sinkContext,
       buffersink,
@@ -383,7 +389,7 @@ void VideoDecoder::initializeFilterGraphForStream(
   std::snprintf(
       description,
       sizeof(description),
-      "scale=%d:%d:sws_flags=bilinear",
+      options.device.type() == torch::kCUDA ? "scale_npp=%d:%d:interp_algo=linear" : "scale=%d:%d:sws_flags=bilinear",
       width,
       height);
   AVFilterInOut* outputsTmp = outputs.release();
@@ -436,6 +442,15 @@ void VideoDecoder::addVideoStreamDecoder(
   if (streamNumber < 0) {
     throw std::invalid_argument("No valid stream found in input file.");
   }
+  if (options.device.type() == torch::kCUDA) {
+    const char* cuvid_suffix = "_cuvid";
+    size_t cuvid_length = std::strlen(codec->name) + std::strlen(cuvid_suffix) + 1;
+    char* cuvid_name = new char[cuvid_length];
+    std::strcpy(cuvid_name, codec->name);
+    std::strcat(cuvid_name, cuvid_suffix);
+    codec = avcodec_find_decoder_by_name(cuvid_name);
+    delete[] cuvid_name;
+  }
   TORCH_CHECK(codec != nullptr);
   StreamInfo& streamInfo = streams_[streamNumber];
   streamInfo.streamIndex = streamNumber;
@@ -464,6 +479,12 @@ void VideoDecoder::addVideoStreamDecoder(
   if (retVal < AVSUCCESS) {
     throw std::invalid_argument(getFFMPEGErrorStringFromErrorCode(retVal));
   }
+  if (options.device.type() == torch::kCUDA) {
+    codecContext->hw_frames_ctx = av_hwframe_ctx_alloc(codecContext->hw_device_ctx);
+    AVHWFramesContext* hwframe_ctx = (AVHWFramesContext*)codecContext->hw_frames_ctx->data;
+    // in avcodec_open2, cuvid will not set sw_format until sequence callback in avcodec_send_packet()
+    hwframe_ctx->sw_format = codecContext->sw_pix_fmt;
+  }
   codecContext->time_base = streamInfo.stream->time_base;
   activeStreamIndices_.insert(streamNumber);
   updateMetadataWithCodecContext(streamInfo.streamIndex, codecContext);

torchcodec_0.2.0.diff

diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index 4085e90..91856c3 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -465,10 +465,14 @@ void VideoDecoder::addVideoStreamDecoder(
   }

   if (videoStreamOptions.device.type() == torch::kCUDA) {
-    avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-        findCudaCodec(
-            videoStreamOptions.device, streamInfo.stream->codecpar->codec_id)
-            .value_or(avCodec));
+    const char* cuvid_suffix = "_cuvid";
+    size_t cuvid_length = std::strlen(avCodec->name) + std::strlen(cuvid_suffix) + 1;
+    char* cuvid_name = new char[cuvid_length];
+    std::strcpy(cuvid_name, avCodec->name);
+    std::strcat(cuvid_name, cuvid_suffix);
+    avCodec = avcodec_find_decoder_by_name(cuvid_name);
+    delete[] cuvid_name;
+    TORCH_CHECK(avCodec != nullptr);
   }
 
   StreamMetadata& streamMetadata =
@@ -503,6 +507,12 @@ void VideoDecoder::addVideoStreamDecoder(
     throw std::invalid_argument(getFFMPEGErrorStringFromErrorCode(retVal));
   }

+  if (videoStreamOptions.device.type() == torch::kCUDA) {
+    codecContext->hw_frames_ctx = av_hwframe_ctx_alloc(codecContext->hw_device_ctx);
+    AVHWFramesContext* hwframe_ctx = (AVHWFramesContext*)codecContext->hw_frames_ctx->data;
+    // in avcodec_open2, cuvid will not set sw_format until sequence callback in avcodec_send_packet()
+    hwframe_ctx->sw_format = codecContext->sw_pix_fmt;
+  }
   codecContext->time_base = streamInfo.stream->time_base;
   activeStreamIndex_ = streamIndex;
   updateMetadataWithCodecContext(streamInfo.streamIndex, codecContext);

torchcodec_0.2.1.diff

diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index 97214ce..530541d 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -449,9 +449,14 @@ void VideoDecoder::addStream(
   // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
   // addStream() which is supposed to be generic
   if (mediaType == AVMEDIA_TYPE_VIDEO && device.type() == torch::kCUDA) {
-    avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-        findCudaCodec(device, streamInfo.stream->codecpar->codec_id)
-            .value_or(avCodec));
+    const char* cuvid_suffix = "_cuvid";
+    size_t cuvid_length = std::strlen(avCodec->name) + std::strlen(cuvid_suffix) + 1;
+    char* cuvid_name = new char[cuvid_length];
+    std::strcpy(cuvid_name, avCodec->name);
+    std::strcat(cuvid_name, cuvid_suffix);
+    avCodec = avcodec_find_decoder_by_name(cuvid_name);
+    delete[] cuvid_name;
+    TORCH_CHECK(avCodec != nullptr);
   }

   AVCodecContext* codecContext = avcodec_alloc_context3(avCodec);
@@ -474,6 +479,12 @@ void VideoDecoder::addStream(
     throw std::invalid_argument(getFFMPEGErrorStringFromErrorCode(retVal));
   }

+  if (device.type() == torch::kCUDA) {
+    codecContext->hw_frames_ctx = av_hwframe_ctx_alloc(codecContext->hw_device_ctx);
+    AVHWFramesContext* hwframe_ctx = (AVHWFramesContext*)codecContext->hw_frames_ctx->data;
+    // in avcodec_open2, cuvid will not set sw_format until sequence callback in avcodec_send_packet()
+    hwframe_ctx->sw_format = codecContext->sw_pix_fmt;
+  }
   codecContext->time_base = streamInfo.stream->time_base;
   containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName =
       std::string(avcodec_get_name(codecContext->codec_id));

使用示例：

python sample_torchcodec.py

import torch

# Print the PyTorch version, whether a CUDA device is visible, and the
# properties of device 0.
print(f"{torch.__version__=}")
print(f"{torch.cuda.is_available()=}")
print(f"{torch.cuda.get_device_properties(0)=}")

import torchcodec
from torchcodec.decoders import VideoDecoder

# Open sample.mp4 with the decoder placed on the "cuda" device, then fetch
# the frame at index 0.
decoder = VideoDecoder("sample.mp4", device="cuda")
frame = decoder[0]

6.4. imageio-ffmpeg

imageio-ffmpeg是imageio的一个插件，它提供对FFmpeg的封装。跟ffmpeg-python一样，其工作实质是把输入的参数拼合之后，调用ffmpeg程序来执行，所以需要你的运行环境有安装ffmpeg，或者设置了ffmpeg所在的bin和lib目录。不建议使用，官方明确表示：You should probably use PyAV instead; it is faster and offers more features。

imageio-ffmpeg: https://github.com/imageio/imageio-ffmpeg

使用示例：

import imageio

# Open the input through the ffmpeg plugin, forwarding '-c:v h264_cuvid' to ffmpeg.
decode_args = ['-c:v', 'h264_cuvid']
video_reader = imageio.get_reader('input.mp4', 'ffmpeg', ffmpeg_params=decode_args)
# Collect every frame in order, then write them all to out.mp4.
frame_list = [frame for frame in video_reader]
imageio.mimsave('out.mp4', frame_list)

6.5. Decord

默认安装的Decord是CPU版本，不支持硬件加速，需要手动编译安装。

Decord依赖FFmpeg，但只是使用demux能力，不需要额外编译支持硬件加速的FFmpeg版本。

因为ffmpeg老版本有一个问题会导致Decord死锁，这个问题在ffmpeg7才被解决，所以请编译安装FFmpeg 7.0以上的版本。

下载Decord：

# Shallow-clone the v0.6.0 tag together with its submodules
git clone -b v0.6.0 --depth 1 --recursive https://github.com/dmlc/decord
cd decord
# Apply the FFmpeg 7 compatibility patch (decord_ffmpeg7.patch)
git apply decord_ffmpeg7.patch

patch内容主要是把Decord对FFmpeg的依赖从4.2改成7.0以上：

decord_ffmpeg7.patch：

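diff --git a/src/video/nvcodec/cuda_threaded_decoder.cc b/src/video/nvcodec/cuda_threaded_decoder.cc
--- a/src/video/nvcodec/cuda_threaded_decoder.cc
+++ b/src/video/nvcodec/cuda_threaded_decoder.cc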
@@ -17,7 +17,7 @@ namespace decord {
 namespace cuda {
 using namespace runtime;

-CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat)
+CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat)
     : device_id_(device_id), stream_({device_id, false}), device_{}, ctx_{}, parser_{}, decoder_{},
     pkt_queue_{}, frame_queue_{},
     run_(false), frame_count_(0), draining_(false),
@@ -70,7 +70,7 @@ CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar,
     }
 }

-void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat) {
+void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat) {
     const char* bsf_name = nullptr;
     if (AV_CODEC_ID_H264 == codecpar->codec_id) {
         // H.264
diff --git a/src/video/nvcodec/cuda_threaded_decoder.h b/src/video/nvcodec/cuda_threaded_decoder.h
index d7e6fcd..61958a1 100644
--- a/src/video/nvcodec/cuda_threaded_decoder.h
+++ b/src/video/nvcodec/cuda_threaded_decoder.h
@@ -46,7 +46,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface {
     using FrameOrderQueuePtr = std::unique_ptr<FrameOrderQueue>;

     public:
-        CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat);
+        CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat);
         void SetCodecContext(AVCodecContext *dec_ctx, int width = -1, int height = -1, int rotation = 0);
         bool Initialized() const;
         void Start();
@@ -70,7 +70,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface {
         void LaunchThreadImpl();
         void RecordInternalError(std::string message);
         void CheckErrorStatus();
-        void InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat);
+        void InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat);

         int device_id_;
         CUStream stream_;
diff --git a/src/video/video_reader.cc b/src/video/video_reader.cc
index af4858d..99c9635 100644
--- a/src/video/video_reader.cc
+++ b/src/video/video_reader.cc
@@ -145,7 +145,7 @@ VideoReader::~VideoReader(){

 void VideoReader::SetVideoStream(int stream_nb) {
     if (!fmt_ctx_) return;
-    AVCodec *dec;
+    const AVCodec *dec;
     int st_nb = av_find_best_stream(fmt_ctx_.get(), AVMEDIA_TYPE_VIDEO, stream_nb, -1, &dec, 0);
     // LOG(INFO) << "find best stream: " << st_nb;
     CHECK_GE(st_nb, 0) << "ERROR cannot find video stream with wanted index: " << stream_nb;

编译Decord：

# Out-of-tree Release build with CUDA support switched on (run from the decord checkout)
mkdir build && cd build
cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release
make

安装Decord:

# From build/, move to the Python package directory and install it for the current user
cd ../python
python3 setup.py install --user

使用示例：

import decord
import torch
from decord import gpu, cpu

video_path="input.mp4"
# Open the video with the decoder context on GPU 0.
vr = decord.VideoReader(video_path, ctx=gpu(0))
nframes = 40
# Use the real frame count instead of a hard-coded 500, so the sampled
# indices never run past the end of a shorter video.
total_frames = len(vr)
# nframes indices spread evenly over [0, total_frames - 1].
idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
# Decode the selected frames in one batch and convert the result to a NumPy array.
video = vr.get_batch(idx).asnumpy()