在PPU上编译Paddle

更新时间:
复制为 MD 格式

本文为您介绍在PPU上编译Paddle的过程,您可以参考此文,并根据自己的镜像环境和软件版本进行编译。

Paddle简介

PaddlePaddle (Parallel Distributed Deep Learning并行分布式深度学习) 是百度研发的深度学习平台,具有易用、高效、灵活和可伸缩等特点,为百度内部多项产品提供深度学习算法支持。

编译NVIDIA Docker

参考Paddle文档,按照百度Paddle官网提供的方式,拉取对应的docker,正常配置后即可顺利完成编译。

nvidia-docker pull registry.baidubce.com/paddlepaddle/paddle:latest-gpu-cuda10.2-cudnn7-dev
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
docker run --name paddle-test -v $PWD:/paddle --network=host -it registry.baidubce.com/paddlepaddle/paddle:latest-gpu-cuda10.2-cudnn7-dev /bin/bash
#安装依赖
pip3.7 install protobuf
apt update
apt install bzip2 make
wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz
tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz
rm cmake-3.16.0-Linux-x86_64.tar.gz
export PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH
#切换分支
cd /paddle
git checkout release/2.3               #(实际使用的release/2.3,2.4版本thirdparty库无法拉取成功)
mkdir -p /paddle/build && cd /paddle/build
cmake .. -DPY_VERSION=3.7 -DWITH_GPU=ON
make -j$(nproc)

相关要求(官网提供)

  1. gcc >= 5.4

  2. python >= 3.6

  3. cmake >= 3.16

  4. os centos >= 7 or ubuntu >= 16

编译Paddle过程记录

1. 拉取代码,建立docker

虽然Paddle官网要求gcc版本>= 5.4即可,但是真正用gcc 5.4会遇到非常多的兼容问题,因此,建议选择的gcc版本 >= 8.2(官方docker中的gcc版本为8.2),本文选用的是holmes团队建立的holmes镜像,gcc版本为9.4。

说明

此处以阿里云内部镜像为例进行操作说明(此镜像外网无法拉取),您可以针对不同的软件,及所需的环境选取不同的ACS容器镜像进行操作,ACS镜像请参见ACS容器镜像版本发布记录

cd /mnt/work
git clone https://github.com/PaddlePaddle/Paddle.git
docker run -it --name wenze002 --shm-size=8g --pid=host --privileged=true -v /mnt/work:/mnt/work -v /aisw/:/aisw/ --device /dev/dri/renderD129:/dev/dri/renderD129 reg.docker.alibaba-inc.com/alinpu_ee/holmes-build:latest /bin/bash

2. 拉取SDK

终端中执行如下命令拉取SDK。

wget http://11.160.46.32:9000/artifactory/list/apackage/daily/ppu/latest/PPU_SDK_CUDA/PPU_SDK_cuda-11.0.3-ubuntu1604-simt.tar.gz
tar -xzvf PPU_SDK_cuda-11.0.3-ubuntu1604-simt.tar.gz
source ./PPU_SDK/envsetup.sh
cd /mnt/work/Paddle
mkdir build && cd build

这里SDK由于一处nvcc_wrapper处理问题,会出现-Xcompiler ,"-Wno-error=deprecated-declarations","-Wno-deprecated-declarations","-O2","-fopenmp","-fPIC","-O3","-DNDEBUG" 这种一个选项后带多个option的情况,现在还不支持,需要9728的支持,目前等待merge中。因此这里需要带9728 patch生成的SDK。

3. thrust库寻找

# Probes whether the toolchain's bundled Thrust is complete by compiling and
# running a tiny CUDA program that includes thrust/shuffle.h and prints
# THRUST_VERSION. If the probe fails, Paddle's patched Thrust headers are
# prepended to the include path so they shadow the broken ones.
function(add_thrust_patches_if_necessary)
  set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu)
  # Probe source: including thrust/shuffle.h is the actual test — old Thrust
  # releases do not ship that header, so compilation fails for them.
  file(WRITE ${thrust_detect_file} ""
    "#include \"thrust/version.h\"\n"
    "#include \"thrust/shuffle.h\"\n"
    "#include \"stdio.h\"\n"
    "int main() {\n"
    "  int version = THRUST_VERSION;\n"
    "  printf(\"%d\", version);\n"
    "  return 0;\n"
    "}\n")

  # "nvcc --run" compiles the probe and then executes it; only the exit code
  # (nvcc_res) is inspected, the printed version number is ignored.
  # NOTE(review): on the PPU SDK this probe exits 0 even though the patches
  # are still needed — the article works around it by commenting out the
  # condition below (see the diff in section 6).
  execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
                  "--run" "${thrust_detect_file}"
                  WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                  RESULT_VARIABLE nvcc_res ERROR_QUIET)
  if(NOT nvcc_res EQUAL 0)
    # Probe failed: fall back to the patched headers shipped in-tree.
    set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust")
    message(STATUS "Add thrust patches: ${thrust_patches}")
    include_directories(${thrust_patches})
  endif()
endfunction()

add_thrust_patches_if_necessary()

这里thrust.cmake检测相关的代码会出现实际输出有打印100909,但是nvcc_res的值是0的现象,这里直接采用将thrust.cmake中第17、21行(即if与endif两行)注释的解决办法。

4. CUDA_ARCH检测失败

cmake/cuda.cmake中有对CUDA_ARCH的检测

######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled).
# Compiles and runs a small CUDA program that prints the "major.minor" compute
# capability of every visible device; the result is cached in
# CUDA_gpu_detect_output. If detection fails, the caller receives the full list
# of known architectures (paddle_known_gpu_archs) instead.
# Usage:
#   detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
  # Skip the probe when a previous configure run already cached the answer.
  if(NOT CUDA_gpu_detect_output)
    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)

    # Probe program: prints "major.minor " per device, or returns -1 when the
    # runtime reports no usable device.
    file(WRITE ${cufile} ""
      "#include \"stdio.h\"\n"
      "#include \"cuda.h\"\n"
      "#include \"cuda_runtime.h\"\n"
      "int main() {\n"
      "  int count = 0;\n"
      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      "  if (count == 0) return -1;\n"
      "  for (int device = 0; device < count; ++device) {\n"
      "    cudaDeviceProp prop;\n"
      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      "      printf(\"%d.%d \", prop.major, prop.minor);\n"
      "  }\n"
      "  return 0;\n"
      "}\n")

    # "nvcc --run" compiles AND executes the probe on the configure machine.
    # NOTE(review): on the PPU SDK this step fails (see the log below), which
    # is why the article pins the arch explicitly with -DCUDA_ARCH_NAME.
    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
                    "--run" "${cufile}"
                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(nvcc_res EQUAL 0)
      # Only keep the last line of nvcc_out: escape literal ';' first, then
      # turn newlines into ';' so nvcc_out becomes a CMake list.
      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
      list(GET nvcc_out -1 nvcc_out)
      # sm_21 has no dedicated binary format; compile it as 2.1(2.0).
      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
    endif()
  endif()

  if(NOT CUDA_gpu_detect_output)
    # Detection failed (or no GPU visible): build for every known architecture.
    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
endfunction()

但是在实际的执行中,找不到CUDA_ARCH

-- WARNING: This is just a warning for publishing release.
      You are building GPU version without supporting different architectures.
      So the wheel package may fail on other GPU architectures.
      You can add -DCUDA_ARCH_NAME=All in cmake command
      to get a full wheel package to resolve this warning.
      While, this version will still work on local GPU architecture.
-- Automatic GPU detection failed. Building for all known architectures.
-- NVCC_FLAGS_EXTRA:  -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86

这里采用的是指定ARCH的方式进行解决。

5. 修改nccl/cuDNN等路径

如果按照上边cuda docker的方式进行cmake .. -DPY_VERSION=3.8 -DWITH_GPU=ON,会发现实际上找到的都是错误的路径,cmake的打印中会出现

-- Check for working CUDA compiler: /mnt/work/PPU_SDK/CUDA_SDK/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Current NCCL header is /usr/include/nccl.h. Current NCCL version is v21210. 
-- Current cuDNN header is /usr/include/cudnn_version.h Current cuDNN version is v8.3.2. 
-- CUDA detected: 11.6.1
-- WARNING: This is just a warning for publishing release.
      You are building GPU version without supporting different architectures.
      So the wheel package may fail on other GPU architectures.
      You can add -DCUDA_ARCH_NAME=All in cmake command
      to get a full wheel package to resolve this warning.
      While, this version will still work on local GPU architecture.
-- Automatic GPU detection failed. Building for all known architectures.
-- NVCC_FLAGS_EXTRA:  -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86
NVCC args: --run /mnt/work/Paddle/build/detect_thrust.cu
nvcc info     :Processing Option: --run
nvcc info     :Processing Option: /mnt/work/Paddle/build/detect_thrust.cu

这是因为holmes提供的docker镜像并不够纯净,CMake实际检测到的是系统路径下的nccl/thrust/cudnn,而Paddle的CMake脚本默认也只会搜索这些系统路径。以cmake/cudnn.cmake为例:

# Default cuDNN search root: the CUDA toolkit dir on Windows, /usr elsewhere.
# Because find_path below uses NO_DEFAULT_PATH, ONLY these locations (plus the
# CUDNN_ROOT environment variable and CUDA_TOOLKIT_INCLUDE) are ever searched —
# an SDK installed anywhere else requires passing CUDNN_ROOT/-DCUDNN_INCLUDE_DIR
# explicitly, which is the workaround used in this article.
if(WIN32)
    set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
else(WIN32)
    set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
endif(WIN32)

find_path(CUDNN_INCLUDE_DIR cudnn.h
    PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
    $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
    NO_DEFAULT_PATH
)

除非指定CUDNN_ROOT,否则只会到/usr/include目录下寻找。这里采用指定路径的方式解决。

6. 避免麻烦的CMake简化

这里大致可分为4类对cmake文件的修改:

  1. 由于代码质量不高,为了尽量少的修改代码,关闭部分error信息(低版本会报warning)。

    diff --git a/CMakeLists.txt b/CMakeLists.txt
    index c4286292b0..f7b6081bb4 100755
    --- a/CMakeLists.txt
    +++ b/CMakeLists.txt
    @@ -80,6 +80,9 @@ endif()
    
     if(WITH_GPU AND NOT APPLE)
       enable_language(CUDA)
    +  set(CMAKE_CXX_FLAGS
    +      "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy"
    +  )
       message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: "
                      "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}")
     endif()
  2. 不想采用cmake -D指定的方式,因此手动关闭部分非必须支持功能。

    diff --git a/CMakeLists.txt b/CMakeLists.txt
    index c4286292b0..f7b6081bb4 100755
    --- a/CMakeLists.txt
    +++ b/CMakeLists.txt
    @@ -252,7 +255,7 @@ option(WITH_LIBXSMM "Compile with libxsmm" OFF)
     option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
     option(WITH_PSLIB "Compile with pslib support" OFF)
     option(WITH_BOX_PS "Compile with box_ps support" OFF)
    -option(WITH_XBYAK "Compile with xbyak support" ON)
    +option(WITH_XBYAK "Compile with xbyak support" OFF)
     option(WITH_CONTRIB "Compile the third-party contributation" OFF)
     option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
     option(WITH_HETERPS "Compile with heterps" OFF)
    @@ -267,11 +270,11 @@ option(
     option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
     option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
     option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF)
    -option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
    -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
    +option(WITH_NCCL "Compile PaddlePaddle with NCCL support" OFF)
    +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF)
     option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
     option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF)
    -option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
    +option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" OFF)
     option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
     option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
     option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF)
    @@ -286,7 +289,7 @@ option(NEW_RELEASE_ALL
     option(NEW_RELEASE_JIT
            "PaddlePaddle next-level release strategy for backup jit package" OFF)
     option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
    -option(WITH_POCKETFFT "Compile with pocketfft support" ON)
    +option(WITH_POCKETFFT "Compile with pocketfft support" OFF)
     option(WITH_RECORD_BUILDTIME
            "Compile PaddlePaddle with record all targets build time" OFF)
     option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF)
  3. 「3. thrust库寻找」一节中对thrust库的修改。

    diff --git a/cmake/thrust.cmake b/cmake/thrust.cmake
    index ff415b1e3c..49b9e40ed7 100644
    --- a/cmake/thrust.cmake
    +++ b/cmake/thrust.cmake
    @@ -14,11 +14,11 @@ function(add_thrust_patches_if_necessary)
                       "--run" "${thrust_detect_file}"
                       WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                       RESULT_VARIABLE nvcc_res ERROR_QUIET)
    -  if(NOT nvcc_res EQUAL 0)
    +  #if(NOT nvcc_res EQUAL 0)
         set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust")
         message(STATUS "Add thrust patches: ${thrust_patches}")
         include_directories(${thrust_patches})
    -  endif()
    +  #endif()
     endfunction()
  4. 引入iomp5库的正确性保证。

    diff --git a/cmake/generic.cmake b/cmake/generic.cmake
    index ba59eae392..1090cbd63a 100644
    --- a/cmake/generic.cmake
    +++ b/cmake/generic.cmake
    @@ -342,7 +342,7 @@ function(cc_library TARGET_NAME)
             if(WIN32)
               target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
             else(WIN32)
    -          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
    +          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -lpthread -liomp5 -Wl,--as-needed")
             endif(WIN32)
           endif()

这里使用了iomp5的库,该库是IntelMKL引入的一个omp库,iomp5库依赖于pthread,如果在链接后的文件中用ldd命令发现没有pthread的so信息,或者pthread的so文件排在iomp5文件的后面,都会报错。此处为保证pthread一定在iomp5前边,在link时强行指定顺序,用-lpthread -liomp5来保证。

7. cmake生成编译命令

cmake .. -DPY_VERSION=3.8 -DWITH_GPU=ON -DCUDA_TOOLKIT_ROOT_DIR="/mnt/work/PPU_SDK/CUDA_SDK" -DCUDA_ARCH_NAME="Ampere" -DCUDNN_INCLUDE_DIR="/mnt/work/PPU_SDK/CUDA_SDK/targets/x86_64-linux/include" -DNCCL_INCLUDE_DIR="/mnt/work/PPU_SDK/CUDA_SDK/targets/x86_64-linux/include" -DCMAKE_CUDA_ARCHITECTURES=80

8. 第一次编译尝试

make -j 32

很快会遇到fatal error:'omp.h' file not found的问题,此处是因为omp.h文件在gcc自有的文件路径下,对于当前没有开启host_split的情况下,默认是找不到的,同时又不能让gcc的路径覆盖clang的路径,因此解决的方式是修改PPU_SDK/CUDA_SDK/bin/nvcc_parser文件。

nvcc_parser修改:

if [ "$OUTPUT_FILE" = '' ]; then
BUILD_CMD="${HGGC} -I ${CUDA_SDK_ROOT}/targets/x86_64-linux/include $HGGC_FLAGS $INPUT_FILE ${LINK_LIBS} $INCLUDE_DIRS $DEFINITIONS -idirafter /usr/lib/gcc/x86_64-linux-gnu/9/include; ${LLD_CMD}"
else
BUILD_CMD="${HGGC} -I ${CUDA_SDK_ROOT}/targets/x86_64-linux/include $HGGC_FLAGS $INPUT_FILE ${LINK_LIBS} -idirafter /usr/lib/gcc/x86_64-linux-gnu/9/include -o $OUTPUT_FILE $INCLUDE_DIRS $DEFINITIONS ; ${LLD_CMD}"
fi

9. 第二次编译尝试

解决上一步的问题后,继续编译,又遇到了error: dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.的问题

root@b62adca0f784:/mnt/work/Paddle/build/paddle/fluid/platform# /mnt/work/PPU_SDK/CUDA_SDK/bin/clang++ -isystem /mnt/work/PPU_SDK/CUDA_SDK/targets/x86_64-linux/include -x hggc --compatible-mode -fcuda-prefix -fPIC $PPU_OPTION --forward-unknown-to-host-compiler -w -D__CUDACC_EXTENDED_LAMBDA__  -Wall  -Wextra  -Werror  -fPIC  -fno-omit-frame-pointer  -Wno-unused-parameter  -Wno-unused-function -O3 -MD -MT=paddle/fluid/platform/CMakeFiles/profiler.dir/profiler.cu.o -MF CMakeFiles/profiler.dir/profiler.cu.o.d -c  /mnt/work/Paddle/paddle/fluid/platform/profiler.cu  -o CMakeFiles/profiler.dir/profiler.cu.o  -I/mnt/work/Paddle/build -I/mnt/work/Paddle/paddle/fluid/framework/io -I/mnt/work/Paddle/patches/thrust -I/mnt/work/Paddle/build/third_party/install/zlib/include -I/mnt/work/Paddle/build/third_party/install -I/mnt/work/Paddle/build/third_party/install/gflags/include -I/mnt/work/Paddle/build/third_party/install/glog/include -I/mnt/work/Paddle/build/third_party/boost/src/extern_boost -I/mnt/work/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/mnt/work/Paddle/build/third_party/threadpool/src/extern_threadpool -I/mnt/work/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/mnt/work/Paddle/build/third_party/install/xxhash/include -I/mnt/work/Paddle/build/third_party/install/warpctc/include -I/mnt/work/Paddle/build/third_party/install/utf8proc/include -I/mnt/work/Paddle/build/third_party/install/mklml/include -I/mnt/work/Paddle/build/third_party/install/mkldnn/include -I/mnt/work/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.8 -I/usr/local/lib/python3.8/dist-packages/numpy/core/include -I/mnt/work/Paddle/build/third_party/pybind/src/extern_pybind/include -I/mnt/work/Paddle/build/third_party/install/gloo/include -I/mnt/work/Paddle/build/third_party/install/xbyak/include -I/mnt/work/Paddle/build/third_party/install/xbyak/include/xbyak -I/mnt/work/Paddle/build/third_party/install/cryptopp/include 
-I/mnt/work/Paddle/build/third_party/pocketfft/src -I/mnt/work/PPU_SDK/CUDA_SDK/extras/CUPTI/include -I/mnt/work/PPU_SDK/CUDA_SDK/targets/x86_64-linux/include -I/mnt/work/PPU_SDK/CUDA_SDK/include -I/mnt/work/Paddle -D__CUDACC__ -D__CUDA__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=1 -DUSE_HGGC -D__NVCC__  -DCUDA_TOOLKIT_ROOT_DIR=/mnt/work/PPU_SDK/CUDA_SDK -DCUDA_VERSION_MAJOR=11 -DCUDA_VERSION_MINOR=1 -DCUDNN_MAJOR_VERSION=8 -DEIGEN_USE_GPU -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_CUDA -DPADDLE_WITH_CUPTI -DPADDLE_WITH_CUSTOM_DEVICE -DPADDLE_WITH_MKLDNN -DPADDLE_WITH_MKLML -DPADDLE_WITH_NCCL -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_XBYAK -DTRT_PLUGIN_FP16_AVALIABLE -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__ -DNDEBUG -Xarch_device -D__CUDA_ARCH__=800 -I/usr/lib/gcc/x86_64-linux-gnu/9/include
In file included from /mnt/work/Paddle/paddle/fluid/platform/profiler.cu:23:
In file included from /mnt/work/Paddle/paddle/fluid/platform/profiler.h:33:
In file included from /mnt/work/Paddle/paddle/fluid/platform/profiler/supplement_tracing.h:19:
In file included from /mnt/work/Paddle/paddle/fluid/framework/shape_inference.h:22:
In file included from /mnt/work/Paddle/paddle/fluid/framework/variable.h:21:
In file included from /mnt/work/Paddle/paddle/fluid/framework/selected_rows_utils.h:24:
In file included from /mnt/work/Paddle/paddle/phi/core/selected_rows.h:24:
In file included from /mnt/work/Paddle/paddle/phi/core/selected_rows_impl.h:26:
In file included from /mnt/work/Paddle/paddle/phi/core/dense_tensor.h:20:
In file included from /mnt/work/Paddle/paddle/phi/core/tensor_base.h:23:
/mnt/work/Paddle/paddle/phi/core/utils/type_registry.h:81:40: error: dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.
const TypeInfo<BaseT> TypeInfo<BaseT>::kUnknownType =
^
/mnt/work/Paddle/paddle/phi/core/tensor_base.h:79:57: note: in instantiation of static data member 'phi::TypeInfo<phi::TensorBase>::kUnknownType' requested here
TypeInfo<TensorBase> type_info_{TypeInfo<TensorBase>::kUnknownType};
^
1 error generated when compiling for .

由于只是一个const属性的问题,可以通过去掉该属性解决

paddle/phi/core/utils/type_registry.h

diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h
    index f27c3db227..afacaa0437 100644
    --- a/paddle/phi/core/utils/type_registry.h
    +++ b/paddle/phi/core/utils/type_registry.h
    @@ -78,7 +78,7 @@ const std::string& TypeInfo<BaseT>::name() const {
}

template <typename BaseT>
-const TypeInfo<BaseT> TypeInfo<BaseT>::kUnknownType =
+ TypeInfo<BaseT> TypeInfo<BaseT>::kUnknownType =
RegisterStaticType<BaseT>("Unknown");

}  // namespace phi

paddle/phi/core/utils/type_info.h

diff --git a/paddle/phi/core/utils/type_info.h b/paddle/phi/core/utils/type_info.h
    index 33a4e09933..e114cc0866 100644
    --- a/paddle/phi/core/utils/type_info.h
    +++ b/paddle/phi/core/utils/type_info.h
    @@ -31,7 +31,7 @@ class TypeInfo {
bool operator==(TypeInfo other) const { return id_ == other.id(); }
bool operator!=(TypeInfo other) const { return id_ != other.id(); }

-  static const TypeInfo kUnknownType;
+  static  TypeInfo kUnknownType;

private:
friend class TypeRegistry<BaseT>;

10. 第三次编译尝试

修改代码后,继续编译,遇到如下问题:

In file included from /mnt/work/Paddle/paddle/fluid/operators/fused/fused_feedforward_op.cu:21:
In file included from /mnt/work/Paddle/paddle/fluid/operators/fused/fused_dropout_helper.h:20:
/mnt/work/Paddle/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h:243:11: error: expected a for, while, or do-while loop to follow '#pragma unroll'
          mask_vec[it][jt] = static_cast<MaskType>(rand[jt] >= dropout_prob);
          ^

发现在一个不是for/while/do-while的地方使用了#pragma unroll,修改paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h代码

diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
    index d53a24a57e..297abf9856 100644
    --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
    +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
    @@ -239,7 +239,6 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
RandVec<VecSize>(&state, rand);
#pragma unroll
for (int jt = 0; jt < VecSize; jt++) {
    -#pragma unroll
        mask_vec[it][jt] = static_cast<MaskType>(rand[jt] >= dropout_prob);
}
}

11. 第四次编译尝试

修改代码后,继续尝试,遇到如下问题:

/mnt/work/Paddle/paddle/fluid/operators/math/cross_entropy.cu:32:5: error: no matching constructor for initialization of 'phi::ErrorSummary'
    PADDLE_ENFORCE(lbl >= 0 && lbl < D || lbl == ignore_index,
    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/mnt/work/Paddle/paddle/phi/core/enforce.h:402:32: note: expanded from macro 'PADDLE_ENFORCE'
      __THROW_ERROR_INTERNAL__(phi::ErrorSummary(__VA_ARGS__)); \
      ~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/mnt/work/Paddle/paddle/phi/core/enforce.h:297:41: note: expanded from macro '__THROW_ERROR_INTERNAL__'
    throw ::phi::enforce::EnforceNotMet(__ERROR_SUMMARY, __FILE__, __LINE__); \
                                        ^~~~~~~~~~~~~~~
/mnt/work/Paddle/paddle/phi/core/errors.h:100:12: note: candidate constructor not viable: call to __host__ function from __global__ function
  explicit ErrorSummary(Args... args) {
           ^
/mnt/work/Paddle/paddle/phi/core/errors.h:107:12: note: candidate constructor not viable: requires 2 arguments, but 5 were provided
  explicit ErrorSummary(ErrorCode code, std::string msg)

查看paddle/fluid/operators/math/cross_entropy.cu源码

// Cross-entropy kernel: Y[i] = -log(X[i*D + label[i]]) for each of the N
// samples with D classes, or 0 when label[i] equals ignore_index.
// NOTE(review): PADDLE_ENFORCE here expands to the host-exception variant on
// this toolchain (see the enforce.h excerpt below), which is what triggers the
// "no matching constructor for phi::ErrorSummary" device-compile error.
template <typename T, typename LabelT>
__global__ void CrossEntropyKernel(T* Y, const T* X, const LabelT* label,
const int N, const int D,
const int ignore_index) {
    CUDA_KERNEL_LOOP(i, N) {
        auto lbl = static_cast<int64_t>(label[i]);
        // Bounds check: the label must index a valid class or be the
        // designated ignore_index sentinel.
        PADDLE_ENFORCE(lbl >= 0 && lbl < D || lbl == ignore_index,
            "The value of label[%d] expected >= 0 and < %ld, or == %ld, "
            "but got %ld. Please check input value.",
            i, D, ignore_index, lbl);
        // Ignored samples contribute zero loss; TolerableValue clamps
        // non-finite results from real_log.
        Y[i] = ignore_index == lbl
            ? static_cast<T>(0)
            : -math::TolerableValue<T>()(real_log(X[i * D + lbl]));
    }
}

而其中PADDLE_ENFORCE宏的实现在paddle/phi/core/enforce.h

#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...)               \
do {                                                             \
if (!(_IS_NOT_ERROR)) {                                        \
printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \
__FILE__,                                             \
__LINE__,                                             \
#_IS_NOT_ERROR,                                       \
##__VA_ARGS__);                                       \
asm("trap;");                                                \
}                                                              \
} while (0)
#elif defined(__HIPCC__)
#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...)               \
do {                                                             \
if (!(_IS_NOT_ERROR)) {                                        \
printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \
__FILE__,                                             \
__LINE__,                                             \
#_IS_NOT_ERROR,                                       \
##__VA_ARGS__);                                       \
abort();                                                     \
}                                                              \
} while (0)
#else
#define PADDLE_ENFORCE(COND, ...)                               \
do {                                                          \
auto __cond__ = (COND);                                     \
if (UNLIKELY(::phi::is_error(__cond__))) {                  \
__THROW_ERROR_INTERNAL__(phi::ErrorSummary(__VA_ARGS__)); \
}                                                           \
} while (0)
#endif

这里报错是在kernel内用到了host throw相关的内容,由于这里边主要是一些printf和代码检查,对于打通整体kernel没那么重要,因此可以通过注释kernel内相关宏的方法来解决,此处列出这个文件的修改,这个宏的使用非常普遍,问题需要修复。

paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu

diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
    index 4c9fb14842..6b47a65ab5 100644
        --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
        +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
        @@ -41,12 +41,12 @@ __global__ void MaskLabelByIndex(T* predicted_logits, const T* logit,
const int64_t D, const int nranks) {
        CUDA_KERNEL_LOOP(i, N) {
            auto real_label = label[i];
            -    PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0),
                -                   "The index is out of bounds, "
                -                   "please check whether the value of label and "
                -                   "input meet the class number. It should "
                -                   "be less than [%d], but received [%d]",
                -                   D * nranks, real_label);
            +    // PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0),
                +    //                "The index is out of bounds, "
                +    //                "please check whether the value of label and "
                +    //                "input meet the class number. It should "
                +    //                "be less than [%d], but received [%d]",
                +    //                D * nranks, real_label);

12. 第五次编译尝试

通过注释相关PADDLE_ENFORCE的使用,继续进行整体编译,很快遇到新的问题

cd /mnt/work/Paddle/build/paddle/fluid/pybind && /usr/local/bin/cmake -E copy /mnt/work/Paddle/build/third_party/install/mkldnn/libmkldnn.so.0 /mnt/work/Paddle/build/paddle/fluid/pybind
[ 57%] copy_if_different /mnt/work/Paddle/paddle/fluid/pybind/op_function_impl.h.tmp to /mnt/work/Paddle/paddle/fluid/pybind/op_function_impl.h
cd /mnt/work/Paddle/build/paddle/fluid/pybind && /usr/local/bin/cmake -E env LD_LIBRARY_PATH=/mnt/work/PPU_SDK/CUDA_SDK/lib64:/mnt/work/PPU_SDK/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:. /mnt/work/Paddle/build/paddle/fluid/pybind/op_function_generator /mnt/work/Paddle/paddle/fluid/pybind/op_function_impl.h.tmp
free(): invalid pointer
Subprocess aborted

实际执行时,都没有进入main函数中,想通过gdb来查找错误信息非常困难。查看相关的ldd信息,都是符合预期的,并没有发现什么问题。

ldd op_function_generator.check
linux-vdso.so.1 (0x00007ffe646e8000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fa4ce5a0000)
libiomp5.so => /mnt/work/Paddle/build/third_party/install/mklml/lib/libiomp5.so (0x00007fa4ce1ab000)
libdnnl.so.2 => /mnt/work/Paddle/build/third_party/install/mkldnn/lib/libdnnl.so.2 (0x00007fa4cbe4e000)
libcudart.so.11.0 => /mnt/work/PPU_SDK/CUDA_SDK/lib64/libcudart.so.11.0 (0x00007fa4cbc2d000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fa4cbc25000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fa4cba43000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fa4cb8f4000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fa4cb8d9000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fa4cb6e7000)
/lib64/ld-linux-x86-64.so.2 (0x00007fa4e9132000)
libgomp.so.1 => /lib/x86_64-linux-gnu/libgomp.so.1 (0x00007fa4cb6a5000)

查看真正的link命令

link.txt

/usr/bin/c++  -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -fopenmp -mavx -O3 -DNDEBUG CMakeFiles/op_function_generator.dir/op_function_generator.cc.o -o op_function_generator   -L/mnt/work/PPU_SDK/CUDA_SDK/lib64  -L/usr/local/cuda/lib64/stubs  -Wl,-rpath,/mnt/work/Paddle/build/third_party/install/mkldnn/lib:/mnt/work/Paddle/build/third_party/install/mklml/lib -lpthread ../../../third_party/install/glog/lib/libglog.a ../framework/libproto_desc.a ../framework/libexecutor.a ../imperative/liblayer.a ../imperative/libtracer.a ../imperative/libengine.a ../imperative/libimperative_profiler.a ../imperative/libimperative_flag.a ../operators/librecurrent_op.a ../operators/libeye_op.a ../operators/liblstm_op.a ../operators/libspectral_op.a ../operators/libsync_batch_norm_op.a ../operators/libwarpctc_op.a ../operators/libload_combine_op.a ../operators/libsave_combine_op.a ../operators/libquantize_linear_op.a ../operators/librun_program_op.a ../operators/libwhere_op.a ../operators/libwhere_index_op.a ../operators/libviterbi_decode_op.a ../operators/libvar_conv_2d_op.a ../operators/libunstack_op.a ../operators/libunsqueeze_op.a ../operators/libunpool_op.a ../operators/libunique_with_counts_op.a ../operators/libunique_op.a ../operators/libunique_consecutive_op.a 
../operators/libuniform_random_op.a ../operators/libuniform_random_inplace_op.a ../operators/libuniform_random_batch_size_like_op.a ../operators/libunfold_op.a ../operators/libunbind_op.a ../operators/libtruncated_gaussian_random_op.a ../operators/libtrunc_op.a ../operators/libtril_triu_op.a ../operators/libtriangular_solve_op.a ../operators/libtree_conv_op.a ../operators/libtranspose_op.a ../operators/libtransfer_layout_op.a ../operators/libtrace_op.a ../operators/libtop_k_v2_op.a ../operators/libtop_k_op.a ../operators/libtile_op.a ../operators/libtensor_array_to_tensor_op.a ../operators/libtemporal_shift_op.a ../operators/libteacher_student_sigmoid_loss_op.a ../operators/libtdm_sampler_op.a ../operators/libtdm_child_op.a ../operators/libtake_along_axis_op.a ../operators/libsvd_op.a ../operators/libsum_op.a ../operators/libstrided_slice_op.a ../operators/libstft_op.a ../operators/libstack_op.a ../operators/libsqueeze_op.a ../operators/libsquared_l2_norm_op.a ../operators/libsquared_l2_distance_op.a ../operators/libspp_op.a ../operators/libsplit_op.a ../operators/libsplit_lod_tensor_op.a ../operators/libspectral_norm_op.a ../operators/libspace_to_depth_op.a ../operators/libsolve_op.a ../operators/libsoftmax_with_cross_entropy_op.a ../operators/libsoftmax_op.a ../operators/libsmooth_l1_loss_op.a ../operators/libslice_op.a ../operators/libsize_op.a ../operators/libsimilarity_focus_op.a ../operators/libsign_op.a ../operators/libsigmoid_cross_entropy_with_logits_op.a ../operators/libshuffle_channel_op.a ../operators/libshuffle_batch_op.a ../operators/libshrink_rnn_memory_op.a ../operators/libshare_data_op.a ../operators/libshare_buffer_op.a ../operators/libshard_index_op.a ../operators/libshape_op.a ../operators/libset_value_op.a ../operators/libselu_op.a ../operators/libselect_output_op.a ../operators/libselect_input_op.a ../operators/libsegment_pool_op.a ../operators/libseed_op.a ../operators/libsearchsorted_op.a ../operators/libscatter_op.a 
../operators/libscatter_nd_add_op.a ../operators/libscale_op.a ../operators/libsave_op.a ../operators/libsampling_id_op.a ../operators/libsample_logits_op.a ../operators/librow_conv_op.a ../operators/libroll_op.a ../operators/libroi_pool_op.a ../operators/libroi_align_op.a ../operators/librnn_op.a ../operators/librnn_memory_helper_op.a ../operators/libreverse_op.a ../operators/libreshape_op.a ../operators/librequantize_op.a ../operators/librepeat_interleave_op.a ../operators/libreorder_lod_tensor_by_rank_op.a ../operators/librenorm_op.a ../operators/libreal_op.a ../operators/libread_file_op.a ../operators/librank_loss_op.a ../operators/librank_attention_op.a ../operators/librange_op.a ../operators/librandperm_op.a ../operators/librandom_routing_op.a ../operators/librandom_crop_op.a ../operators/librandint_op.a ../operators/libqueue_generator_op.a ../operators/libquantize_op.a ../operators/libqr_op.a ../operators/libpyramid_hash_op.a ../operators/libput_along_axis_op.a ../operators/libpush_dense_op.a ../operators/libpull_sparse_v2_op.a ../operators/libpull_sparse_op.a ../operators/libpull_gpups_sparse_op.a ../operators/libpull_box_sparse_op.a ../operators/libpull_box_extended_sparse_op.a ../operators/libpsroi_pool_op.a ../operators/libprune_gate_by_capacity_op.a ../operators/libprroi_pool_op.a ../operators/libprint_op.a ../operators/libprelu_op.a ../operators/libpositive_negative_pair_op.a ../operators/libpool_with_index_op.a ../operators/libpool_op.a ../operators/libpoisson_op.a ../operators/libpixel_shuffle_op.a ../operators/libpartial_sum_op.a ../operators/libpartial_concat_op.a ../operators/libpad_op.a ../operators/libpad_constant_like_op.a ../operators/libpad3d_op.a ../operators/libpad2d_op.a ../operators/libp_norm_op.a ../operators/liboverlap_add_op.a ../operators/libone_hot_v2_op.a ../operators/libone_hot_op.a ../operators/libnumber_count_op.a ../operators/libnorm_op.a ../operators/libnop_op.a ../operators/libnll_loss_op.a ../operators/libnce_op.a 
../operators/libmv_op.a ../operators/libmultiplex_op.a ../operators/libmultinomial_op.a ../operators/libmulti_dot_op.a ../operators/libmul_op.a ../operators/libmodified_huber_loss_op.a ../operators/libmode_op.a ../operators/libminus_op.a ../operators/libmeshgrid_op.a ../operators/libmerge_selected_rows_op.a ../operators/libmerge_lod_tensor_op.a ../operators/libmemcpy_op.a ../operators/libmemcpy_h2d_op.a ../operators/libmemcpy_d2h_op.a ../operators/libmean_op.a ../operators/libmean_iou_op.a ../operators/libmaxout_op.a ../operators/libmax_sequence_len_op.a ../operators/libmatrix_rank_op.a ../operators/libmatrix_power_op.a ../operators/libmatmul_v2_op.a ../operators/libmatmul_op.a ../operators/libmatch_matrix_tensor_op.a ../operators/libmasked_select_op.a ../operators/libmarker_op.a ../operators/libmargin_rank_loss_op.a ../operators/libmargin_cross_entropy_op.a ../operators/liblu_unpack_op.a ../operators/liblu_op.a ../operators/liblstsq_op.a ../operators/liblstmp_op.a ../operators/liblstm_unit_op.a ../operators/liblrn_op.a ../operators/liblookup_table_v2_op.a ../operators/liblookup_table_op.a ../operators/liblookup_table_dequant_op.a ../operators/liblog_softmax_op.a ../operators/liblog_loss_op.a ../operators/liblod_tensor_to_array_op.a ../operators/liblod_reset_op.a ../operators/liblod_rank_table_op.a ../operators/liblod_array_length_op.a ../operators/libload_op.a ../operators/liblinspace_op.a ../operators/liblinear_chain_crf_op.a ../operators/liblimit_by_capacity_op.a ../operators/liblgamma_op.a ../operators/liblerp_op.a ../operators/liblayer_norm_op.a ../operators/liblabel_smooth_op.a ../operators/libl1_norm_op.a ../operators/libkthvalue_op.a ../operators/libkron_op.a ../operators/libkldiv_loss_op.a ../operators/libisfinite_v2_op.a ../operators/libisfinite_op.a ../operators/libisclose_op.a ../operators/libis_empty_op.a ../operators/libinverse_op.a ../operators/libinterpolate_v2_op.a ../operators/libinterpolate_op.a ../operators/libinstance_norm_op.a 
../operators/libinplace_abn_op.a ../operators/libindex_select_op.a ../operators/libindex_sample_op.a ../operators/libincrement_op.a ../operators/libimag_op.a ../operators/libim2sequence_op.a ../operators/libhuber_loss_op.a ../operators/libhistogram_op.a ../operators/libhinge_loss_op.a ../operators/libhierarchical_sigmoid_op.a ../operators/libhash_op.a ../operators/libgumbel_softmax_op.a ../operators/libgru_unit_op.a ../operators/libgru_op.a ../operators/libgroup_norm_op.a ../operators/libgrid_sampler_op.a ../operators/libgraph_send_recv_op.a ../operators/libgraph_sample_neighbors_op.a ../operators/libgraph_reindex_op.a ../operators/libgraph_khop_sampler_op.a ../operators/libget_tensor_from_selected_rows_op.a ../operators/libgelu_op.a ../operators/libgaussian_random_op.a ../operators/libgaussian_random_batch_size_like_op.a ../operators/libgather_tree_op.a ../operators/libgather_op.a ../operators/libgather_nd_op.a ../operators/libfused_softmax_mask_upper_triangle_op.a ../operators/libfused_softmax_mask_op.a ../operators/libfsp_op.a ../operators/libframe_op.a ../operators/libfold_op.a ../operators/libflip_op.a ../operators/libflatten_op.a ../operators/libfilter_by_instag_op.a ../operators/libfill_zeros_like_op.a ../operators/libfill_op.a ../operators/libfill_diagonal_tensor_op.a ../operators/libfill_diagonal_op.a ../operators/libfill_constant_op.a ../operators/libfill_constant_batch_size_like_op.a ../operators/libfill_any_op.a ../operators/libfill_any_like_op.a ../operators/libfc_op.a ../operators/libfake_quantize_op.a ../operators/libfake_dequantize_op.a ../operators/libexponential_op.a ../operators/libexpand_v2_op.a ../operators/libexpand_op.a ../operators/libexpand_as_v2_op.a ../operators/libexpand_as_op.a ../operators/liberfinv_op.a ../operators/liberf_op.a ../operators/libenqueue_op.a ../operators/libempty_op.a ../operators/libeinsum_op.a ../operators/libeigvalsh_op.a ../operators/libeigvals_op.a ../operators/libeigh_op.a ../operators/libeig_op.a 
../operators/libedit_distance_op.a ../operators/libdropout_op.a ../operators/libdot_op.a ../operators/libdist_op.a ../operators/libdirichlet_op.a ../operators/libdigamma_op.a ../operators/libdiagonal_op.a ../operators/libdiag_v2_op.a ../operators/libdiag_op.a ../operators/libdiag_embed_op.a ../operators/libdgc_clip_by_norm_op.a ../operators/libdeterminant_op.a ../operators/libdetection_map_op.a ../operators/libdequeue_op.a ../operators/libdequantize_op.a ../operators/libdequantize_log_op.a ../operators/libdequantize_abs_max_op.a ../operators/libdelete_var_op.a ../operators/libdeformable_psroi_pooling_op.a ../operators/libdeformable_conv_v1_op.a ../operators/libdeformable_conv_op.a ../operators/libdecode_jpeg_op.a ../operators/libdata_norm_op.a ../operators/libcvm_op.a ../operators/libcumsum_op.a ../operators/libcumprod_op.a ../operators/libcudnn_lstm_op.a ../operators/libctc_align_op.a ../operators/libcross_op.a ../operators/libcross_entropy_op.a ../operators/libcrop_tensor_op.a ../operators/libcrop_op.a ../operators/libcrf_decoding_op.a ../operators/libcos_sim_op.a ../operators/libcorrelation_op.a ../operators/libcopy_cross_scope_op.a ../operators/libconv_transpose_op.a ../operators/libconv_shift_op.a ../operators/libconv_op.a ../operators/libconj_op.a ../operators/libconcat_op.a ../operators/libcomplex_view_op.a ../operators/libcomplex_op.a ../operators/libcoalesce_tensor_op.a ../operators/libclip_op.a ../operators/libclip_by_norm_op.a ../operators/libclass_center_sample_op.a ../operators/libchunk_eval_op.a ../operators/libcholesky_solve_op.a ../operators/libcholesky_op.a ../operators/libcenter_loss_op.a ../operators/libcast_op.a ../operators/libbroadcast_tensors_op.a ../operators/libbpr_loss_op.a ../operators/libbmm_op.a ../operators/libbincount_op.a ../operators/libbilinear_tensor_product_op.a ../operators/libbilateral_slice_op.a ../operators/libbernoulli_op.a ../operators/libbeam_search_op.a ../operators/libbeam_search_decode_op.a ../operators/libbce_loss_op.a 
../operators/libbatch_norm_op.a ../operators/libbatch_fc_op.a ../operators/libaverage_accumulates_op.a ../operators/libattention_lstm_op.a ../operators/libatan2_op.a ../operators/libassign_value_op.a ../operators/libassign_pos_op.a ../operators/libassign_op.a ../operators/libassert_op.a ../operators/libascend_trigger_op.a ../operators/libarray_to_lod_tensor_op.a ../operators/libargsort_op.a ../operators/libarg_min_op.a ../operators/libarg_max_op.a ../operators/libangle_op.a ../operators/liballclose_op.a ../operators/libaffine_grid_op.a ../operators/libaffine_channel_op.a ../operators/libaddmm_op.a ../operators/libadd_position_encoding_op.a ../operators/libactivation_op.a ../operators/libabs_op.a ../operators/reader/libread_op.a ../operators/reader/libcreate_py_reader_op.a ../operators/reader/libcreate_double_buffer_reader_op.a ../operators/amp/libupdate_loss_scaling_op.a ../operators/amp/libget_float_status_op.a ../operators/amp/libclear_float_status_op.a ../operators/amp/libcheck_finite_and_unscale_op.a ../operators/amp/liballoc_float_status_op.a ../operators/string/libfaster_tokenizer_op.a ../operators/sequence_ops/libsequence_unpad_op.a ../operators/sequence_ops/libsequence_topk_avg_pooling_op.a ../operators/sequence_ops/libsequence_softmax_op.a ../operators/sequence_ops/libsequence_slice_op.a ../operators/sequence_ops/libsequence_scatter_op.a ../operators/sequence_ops/libsequence_reverse_op.a ../operators/sequence_ops/libsequence_reshape_op.a ../operators/sequence_ops/libsequence_pool_op.a ../operators/sequence_ops/libsequence_pad_op.a ../operators/sequence_ops/libsequence_mask_op.a ../operators/sequence_ops/libsequence_expand_op.a ../operators/sequence_ops/libsequence_expand_as_op.a ../operators/sequence_ops/libsequence_erase_op.a ../operators/sequence_ops/libsequence_enumerate_op.a ../operators/sequence_ops/libsequence_conv_op.a ../operators/sequence_ops/libsequence_concat_op.a ../operators/reduce_ops/libreduce_sum_op.a 
../operators/reduce_ops/libreduce_prod_op.a ../operators/reduce_ops/libreduce_min_op.a ../operators/reduce_ops/libreduce_mean_op.a ../operators/reduce_ops/libreduce_max_op.a ../operators/reduce_ops/libreduce_any_op.a ../operators/reduce_ops/libreduce_amin_op.a ../operators/reduce_ops/libreduce_amax_op.a ../operators/reduce_ops/libreduce_all_op.a ../operators/reduce_ops/liblogsumexp_op.a ../operators/reduce_ops/libfrobenius_norm_op.a ../operators/optimizers/libsparse_momentum_op.a ../operators/optimizers/libsgd_op.a ../operators/optimizers/librmsprop_op.a ../operators/optimizers/libproximal_gd_op.a ../operators/optimizers/libproximal_adagrad_op.a ../operators/optimizers/libpow2_decay_with_linear_warmup_op.a ../operators/optimizers/libmomentum_op.a ../operators/optimizers/libmerged_momentum_op.a ../operators/optimizers/libmerged_adam_op.a ../operators/optimizers/liblars_momentum_op.a ../operators/optimizers/liblamb_op.a ../operators/optimizers/libftrl_op.a ../operators/optimizers/libdpsgd_op.a ../operators/optimizers/libdistributed_fused_lamb_op.a ../operators/optimizers/libdistributed_fused_lamb_init_op.a ../operators/optimizers/libdgc_momentum_op.a ../operators/optimizers/libdecayed_adagrad_op.a ../operators/optimizers/libadamw_op.a ../operators/optimizers/libadamax_op.a ../operators/optimizers/libadam_op.a ../operators/optimizers/libadagrad_op.a ../operators/optimizers/libadadelta_op.a ../operators/metrics/libprecision_recall_op.a ../operators/metrics/libauc_op.a ../operators/metrics/libaccuracy_op.a ../operators/fused/libresnet_unit_op.a ../operators/fused/libfused_multi_transformer_op.a ../operators/fused/libfused_attention_op.a ../operators/fused/libfused_feedforward_op.a ../operators/fused/libfused_bn_add_activation_op.a ../operators/fused/libfusion_group_op.a ../operators/fused/libfused_embedding_eltwise_layernorm_op.a ../operators/fused/libskip_layernorm_op.a ../operators/fused/libmultihead_matmul_op.a 
../operators/fused/libfused_fc_elementwise_layernorm_op.a ../operators/fused/libfusion_conv_inception_op.a ../operators/fused/libfusion_transpose_flatten_concat_op.a ../operators/fused/libconv_fusion_op.a ../operators/fused/libfused_bn_activation_op.a ../operators/fused/libfusion_lstm_op.a ../operators/fused/libfusion_gru_op.a ../operators/fused/libmulti_gru_op.a ../operators/fused/libfusion_squared_mat_sub_op.a ../operators/fused/libfusion_seqpool_cvm_concat_op.a ../operators/fused/libfusion_seqpool_concat_op.a ../operators/fused/libfusion_seqexpand_concat_fc_op.a ../operators/fused/libfusion_seqconv_eltadd_relu_op.a ../operators/fused/libfusion_repeated_fc_relu_op.a ../operators/fused/libfused_seqpool_cvm_op.a ../operators/fused/libfused_embedding_seq_pool_op.a ../operators/fused/libfused_embedding_fc_lstm_op.a ../operators/fused/libfused_elemwise_activation_op.a ../operators/elementwise/libelementwise_sub_op.a ../operators/elementwise/libelementwise_pow_op.a ../operators/elementwise/libelementwise_mul_op.a ../operators/elementwise/libelementwise_mod_op.a ../operators/elementwise/libelementwise_min_op.a ../operators/elementwise/libelementwise_max_op.a ../operators/elementwise/libelementwise_floordiv_op.a ../operators/elementwise/libelementwise_div_op.a ../operators/elementwise/libelementwise_add_op.a ../operators/detection/libgenerate_mask_labels_op.a ../operators/detection/libroi_perspective_transform_op.a ../operators/detection/libcollect_fpn_proposals_op.a ../operators/detection/libdistribute_fpn_proposals_op.a ../operators/detection/libgenerate_proposals_v2_op.a ../operators/detection/libgenerate_proposals_op.a ../operators/detection/libnms_op.a ../operators/detection/libretinanet_detection_output_op.a ../operators/detection/libsigmoid_focal_loss_op.a ../operators/detection/libbox_decoder_and_assign_op.a ../operators/detection/libyolo_box_op.a ../operators/detection/libyolov3_loss_op.a ../operators/detection/libbox_clip_op.a 
../operators/detection/libmatrix_nms_op.a ../operators/detection/liblocality_aware_nms_op.a ../operators/detection/libmulticlass_nms_op.a ../operators/detection/libgenerate_proposal_labels_op.a ../operators/detection/librpn_target_assign_op.a ../operators/detection/libpolygon_box_transform_op.a ../operators/detection/libtarget_assign_op.a ../operators/detection/libanchor_generator_op.a ../operators/detection/libmine_hard_examples_op.a ../operators/detection/libbipartite_match_op.a ../operators/detection/libprior_box_op.a ../operators/detection/libiou_similarity_op.a ../operators/detection/libdensity_prior_box_op.a ../operators/detection/libbox_coder_op.a ../operators/controlflow/libwhile_op.a ../operators/controlflow/libtensor_array_read_write_op.a ../operators/controlflow/liblogical_op.a ../operators/controlflow/libget_places_op.a ../operators/controlflow/libfetch_v2_op.a ../operators/controlflow/libfetch_op.a ../operators/controlflow/libfeed_op.a ../operators/controlflow/libdepend_op.a ../operators/controlflow/libconditional_block_infer_op.a ../operators/controlflow/libcompare_op.a ../operators/controlflow/libcompare_all_op.a ../operators/controlflow/libbitwise_op.a ../../../third_party/install/xxhash/lib/libxxhash.a ../framework/libexecutor.a ../../phi/libphi.a ../../phi/api/lib/utils/libphi_api_utils.a ../operators/libgather_scatter_kernel.a ../operators/math/libselected_rows_functor.a ../framework/libselected_rows_utils.a ../../phi/kernels/funcs/lapack/liblapack_function.a ../framework/liblod_tensor.a ../operators/math/libmaxouting.a ../operators/math/libunpooling.a ../../phi/kernels/funcs/libpooling.a ../framework/liblod_rank_table.a ../operators/math/libcontext_project.a ../operators/math/libsequence_pooling.a ../framework/libexecutor.a ../platform/libdevice_memory_aligment.a ../framework/libgenerator.a ../platform/dynload/libdynload_warpctc.a ../operators/math/libsequence_padding.a ../operators/math/libsequence_scale.a ../operators/math/libcos_sim_functor.a 
../memory/libmemory.a ../operators/jit/libjit_kernel_helper.a ../operators/math/libconcat_and_split.a ../operators/math/libcross_entropy.a ../operators/math/libsoftmax.a ../operators/math/libvol2col.a ../operators/math/libim2col.a ../operators/math/libsampler.a ../operators/math/libsample_prob.a ../operators/math/libtree2col.a ../../phi/kernels/funcs/libsequence2batch.a ../../phi/kernels/funcs/liblstm_compute.a ../operators/math/libmatrix_bit_code.a ../../phi/kernels/funcs/libgru_compute.a ../../phi/kernels/funcs/detail/libactivation_functions.a ../operators/math/libbeam_search.a ../operators/math/libfc.a ../../phi/kernels/funcs/libmatrix_inverse.a ../operators/math/libmatrix_solve.a ../framework/fleet/libbox_wrapper.a ../framework/fleet/libps_gpu_wrapper.a ../operators/libcommon_infer_shape_functions.a ../../phi/kernels/funcs/eigen/libeigen_function.a ../operators/math/libdepthwise_conv.a ../operators/math/libprelu.a ../operators/math/libbert_encoder_functor.a ../platform/libdevice_memory_aligment.a ../imperative/liblayer.a ../operators/libtensor_formatter.a ../framework/libop_version_registry.a ../framework/libexecutor_cache.a ../framework/libparallel_executor.a ../framework/details/libscope_buffered_ssa_graph_executor.a ../framework/details/libscope_buffered_monitor.a ../framework/details/libparallel_ssa_graph_executor.a ../framework/details/libasync_ssa_graph_executor.a ../framework/details/libthreaded_ssa_graph_executor.a ../framework/details/libbuild_strategy.a ../framework/ir/libpass_builder.a ../framework/ir/libgraph_viz_pass.a ../framework/ir/multi_devices_graph_pass/libmulti_devices_graph_print_pass.a ../framework/ir/multi_devices_graph_pass/libmulti_devices_graph_check_pass.a ../framework/ir/libfuse_elewise_add_act_pass.a ../framework/ir/libfuse_bn_act_pass.a ../framework/ir/libfuse_bn_add_act_pass.a ../framework/ir/libmulti_batch_merge_pass.a ../framework/ir/libfuse_relu_depthwise_conv_pass.a ../framework/ir/liblock_free_optimize_pass.a 
../framework/ir/multi_devices_graph_pass/libsequential_execution_pass.a ../framework/ir/multi_devices_graph_pass/liball_reduce_deps_pass.a ../framework/ir/multi_devices_graph_pass/libadd_reader_dependency_pass.a ../framework/ir/multi_devices_graph_pass/libmodify_op_lock_and_record_event_pass.a ../framework/ir/libcoalesce_grad_tensor_pass.a ../framework/ir/multi_devices_graph_pass/libfuse_all_reduce_op_pass.a ../framework/ir/multi_devices_graph_pass/libbackward_optimizer_op_deps_pass.a ../framework/ir/fuse_optimizer_ops_pass/libfuse_adam_op_pass.a ../framework/ir/fuse_optimizer_ops_pass/libfuse_sgd_op_pass.a ../framework/ir/fuse_optimizer_ops_pass/libfuse_momentum_op_pass.a ../framework/ir/fuse_optimizer_ops_pass/libfuse_optimizer_op_pass.a ../framework/ir/libsync_batch_norm_pass.a ../framework/ir/libruntime_context_cache_pass.a ../framework/ir/multi_devices_graph_pass/libfix_op_run_order_pass.a ../framework/ir/libfuse_gemm_epilogue_pass.a ../framework/ir/fusion_group/libfusion_group_pass.a ../framework/ir/fusion_group/libcode_generator.a ../framework/ir/libsubgraph_detector.a ../framework/ir/libmkldnn_placement_pass.a ../framework/ir/libplacement_pass_base.a ../framework/details/libbind_threaded_ssa_graph_executor.a ../framework/details/libfetch_op_handle.a ../framework/details/libfast_threaded_ssa_graph_executor.a ../framework/details/libssa_graph_executor.a ../framework/ir/memory_optimize_pass/libreference_count_pass.a ../framework/ir/memory_optimize_pass/libop_graph_view.a ../framework/ir/memory_optimize_pass/libeager_deletion_pass.a ../framework/details/libeager_deletion_op_handle.a ../framework/ir/memory_optimize_pass/libconditional_block_op_eager_deletion_pass.a ../framework/ir/memory_optimize_pass/libwhile_op_eager_deletion_pass.a ../framework/ir/memory_optimize_pass/librecurrent_op_eager_deletion_pass.a ../framework/ir/memory_optimize_pass/libbuffer_shared_inplace_op_pass.a ../framework/ir/memory_optimize_pass/libbuffer_shared_cross_op_memory_reuse_pass.a 
../framework/ir/memory_optimize_pass/libinplace_addto_op_pass.a ../framework/ir/memory_optimize_pass/libmemory_reuse_pass.a ../framework/ir/memory_optimize_pass/libreference_count_pass_helper.a ../framework/details/libshare_tensor_buffer_op_handle.a ../framework/details/libshare_tensor_buffer_functor.a ../framework/ir/multi_devices_graph_pass/libset_reader_device_info_utils.a ../framework/ir/multi_devices_graph_pass/libmulti_devices_graph_pass.a ../framework/details/libmulti_devices_helper.a ../framework/details/libcomputation_op_handle.a ../framework/details/libscale_loss_grad_op_handle.a ../framework/details/librpc_op_handle.a ../framework/details/libfetch_barrier_op_handle.a ../framework/details/libgrad_merge_all_reduce_op_handle.a ../framework/details/liball_reduce_op_handle.a ../framework/details/libfused_all_reduce_op_handle.a ../platform/libdevice_memory_aligment.a ../framework/details/libreduce_op_handle.a ../framework/details/libfused_broadcast_op_handle.a ../framework/details/libbroadcast_op_handle.a ../framework/details/libvariable_visitor.a ../framework/details/libfetch_async_op_handle.a ../framework/details/libop_handle_base.a ../framework/details/libvar_handle.a ../operators/reader/libpy_reader.a ../operators/reader/libbuffered_reader.a ../operators/reader/libreader_op_registry.a ../framework/libreader.a ../framework/libstring_array.a ../platform/libdevice_code.a ../operators/detection/libmask_util.a ../operators/detection/libgpc.a ../framework/libnaive_executor.a ../framework/libexecutor.a ../operators/librecurrent_op.a ../distributed/fleet_executor/libfleet_executor.a ../framework/libexecutor_gc_helper.a ../operators/controlflow/librecurrent_op_helper.a ../operators/controlflow/libconditional_block_op_helper.a ../operators/controlflow/libconditional_block_op.a ../framework/libexecutor.a ../operators/librecurrent_op.a ../distributed/fleet_executor/libfleet_executor.a ../framework/libexecutor_gc_helper.a 
../operators/controlflow/librecurrent_op_helper.a ../operators/controlflow/libconditional_block_op_helper.a ../operators/controlflow/libconditional_block_op.a ../framework/liblod_rank_table.a ../framework/fleet/libbox_wrapper.a ../framework/fleet/libps_gpu_wrapper.a ../framework/fleet/libgloo_wrapper.a ../framework/io/libfs.a ../framework/io/libshell.a ../../utils/string/libstring_helper.a ../framework/fleet/libfleet_wrapper.a ../framework/fleet/libheter_wrapper.a ../framework/libheter_service_proto.a ../platform/liblodtensor_printer.a ../framework/libfeed_fetch_method.a ../framework/ir/libgraph_to_program_pass.a ../framework/libop_version_registry.a ../framework/libop_version_proto.a ../framework/ir/libfuse_pass_base.a ../framework/ir/libop_compat_sensible_pass.a ../framework/ir/libgraph_pattern_detector.a ../framework/ir/libgraph_traits.a ../framework/ir/libpass.a ../framework/ir/libgraph_helper.a ../framework/ir/libgraph.a ../../utils/string/libpretty_log.a ../framework/ir/libnode.a ../framework/libop_def_api.a ../framework/libop_def_proto.a ../platform/libtimer.a ../imperative/liblayer.a ../imperative/libtracer.a ../imperative/libengine.a ../imperative/libprepared_operator.a ../imperative/libvar_helper.a ../../phi/api/libphi_api.a ../imperative/jit/libprogram_desc_tracer.a ../imperative/libamp.a ../imperative/libgradient_accumulator.a ../../phi/api/lib/libphi_bw_function_api.a ../imperative/jit/libop_desc_meta.a ../eager/api/utils/libglobal_utils.a ../imperative/liblayer.a ../imperative/libtracer.a ../imperative/libengine.a ../imperative/libprepared_operator.a ../imperative/libvar_helper.a ../../phi/api/libphi_api.a ../imperative/jit/libprogram_desc_tracer.a ../imperative/libamp.a ../imperative/libgradient_accumulator.a ../../phi/api/lib/libphi_bw_function_api.a ../imperative/jit/libop_desc_meta.a ../eager/api/utils/libglobal_utils.a ../imperative/libimperative_flag.a ../framework/libvariable_helper.a ../platform/libdenormal.a ../imperative/liblayout_autotune.a 
../../phi/api/lib/libsparse_bw_api.a ../../phi/api/lib/libphi_tensor.a ../../phi/api/lib/libsparse_api.a ../../phi/api/lib/libsparse_api_custom_impl.a ../../phi/api/lib/libstrings_api.a ../../phi/api/lib/libphi_function_api.a ../../phi/api/lib/libapi_custom_impl.a ../../phi/api/lib/libphi_data_transform.a ../../phi/infermeta/libbackward_infermeta.a ../operators/libcommon_infer_shape_functions.a ../distributed/fleet_executor/libfleet_executor_desc_proto.a ../distributed/fleet_executor/libinterceptor_message_proto.a ../distributed/fleet_executor/libtask_loop_thread_pool.a ../platform/libcollective_helper.a ../framework/libop_registry.a ../operators/controlflow/libwhile_op_helper.a ../framework/libgarbage_collector.a ../operators/controlflow/libop_variant.a ../framework/libproto_desc.a ../framework/liboperator.a ../framework/libtrainer_desc_proto.a ../framework/libdata_feed_proto.a ../framework/libdata_transform.a ../framework/libdata_device_transform.a ../framework/libdata_type_transform.a ../framework/libdata_layout_transform.a ../framework/libtransfer_scope_cache.a ../framework/libop_kernel_type.a ../framework/libop_call_stack.a ../framework/libop_proto_maker.a ../framework/libunused_var_check.a ../framework/details/libnan_inf_utils.a ../framework/libphi_utils.a ../framework/libinfershape_utils.a ../framework/libshape_inference.a ../framework/libop_info.a ../framework/libno_need_buffer_vars_inference.a ../framework/libattribute.a ../../phi/libphi.a ../../phi/ops/compat/libop_compat_infos.a ../../phi/api/lib/libapi_scalar.a ../../phi/api/lib/libtensor_copy.a ../../phi/api/lib/libkernel_dispatch.a ../../phi/api/lib/libapi_gen_utils.a ../../phi/api/lib/libphi_tensor_raw.a ../../phi/api/lib/libcontext_pool.a ../platform/libinit.a ../../phi/api/lib/libcontext_pool.a ../platform/libinit.a ../../phi/kernels/libconv_grad_kernel.a ../../phi/kernels/libconv_grad_kernel_base.a ../../phi/kernels/libconv_grad_kernel_gpudnn.a ../../phi/kernels/libconv_grad_grad_kernel.a 
../../phi/kernels/libconv_grad_grad_kernel_base.a ../../phi/kernels/libconv_grad_grad_kernel_gpudnn.a ../../phi/kernels/libconv_kernel.a ../../phi/kernels/libconv_kernel_base.a ../../phi/kernels/libconv_kernel_gpudnn.a ../../phi/kernels/libconv_transpose_kernel.a ../../phi/kernels/libconv_transpose_kernel_base.a ../../phi/kernels/libconv_transpose_kernel_gpudnn.a ../../phi/kernels/libconv_transpose_grad_kernel.a ../../phi/kernels/libconv_transpose_grad_kernel_base.a ../../phi/kernels/libconv_transpose_grad_kernel_gpudnn.a ../../phi/kernels/autotune/libswitch_autotune.a ../../phi/kernels/autotune/libcache.a ../../phi/kernels/libcross_entropy_kernel.a ../operators/math/libcross_entropy.a ../../phi/kernels/libdeformable_conv_kernel.a ../../phi/kernels/libdeformable_conv_grad_kernel.a ../../phi/kernels/funcs/libdeformable_conv_functor.a ../../phi/kernels/libdeterminant_grad_kernel.a ../../phi/kernels/libeigh_kernel.a ../../phi/kernels/funcs/lapack/liblapack_function.a ../../phi/backends/dynload/libphi_dynload_lapack.a ../../phi/kernels/libhierarchical_sigmoid_kernel.a ../../phi/kernels/libhierarchical_sigmoid_grad_kernel.a ../operators/math/libmatrix_bit_code.a ../../phi/kernels/libgumbel_softmax_kernel.a ../../phi/kernels/libgumbel_softmax_grad_kernel.a ../../phi/kernels/libmatrix_power_kernel.a ../../phi/kernels/libmatrix_power_grad_kernel.a ../../phi/kernels/funcs/libmatrix_inverse.a ../../phi/kernels/libmaxout_kernel.a ../../phi/kernels/libmaxout_grad_kernel.a ../operators/math/libmaxouting.a ../../phi/kernels/libput_along_axis_kernel.a ../../phi/kernels/libput_along_axis_grad_kernel.a ../../phi/kernels/libsegment_pool_kernel.a ../../phi/kernels/libsegment_pool_grad_kernel.a ../../phi/kernels/funcs/libsegment_pooling.a ../../phi/kernels/libsoftmax_kernel.a ../../phi/kernels/libsoftmax_kernel_base.a ../../phi/kernels/libsoftmax_kernel_gpudnn.a ../../phi/kernels/libsoftmax_grad_kernel.a ../../phi/kernels/libsoftmax_grad_kernel_base.a 
../../phi/kernels/libsoftmax_grad_kernel_gpudnn.a ../operators/math/libsoftmax.a ../operators/jit/libjit_kernel_helper.a ../operators/jit/refer/libjit_kernel_refer.a ../operators/jit/more/mkl/libjit_kernel_mkl.a ../operators/jit/more/intrinsic/libjit_kernel_intrinsic.a ../operators/jit/more/mix/libjit_kernel_mix.a ../operators/jit/gen/libjit_kernel_jitcode.a ../operators/jit/libjit_kernel_base.a ../../phi/kernels/libtake_along_axis_kernel.a ../../phi/kernels/libtake_along_axis_grad_kernel.a ../../phi/kernels/libtriangular_solve_grad_kernel.a ../../phi/kernels/funcs/libmatrix_reduce.a ../../phi/kernels/librnn_kernel.a ../../phi/kernels/librnn_grad_kernel.a ../../phi/kernels/funcs/liblstm_compute.a ../../phi/kernels/funcs/libgru_compute.a ../../phi/kernels/funcs/detail/libactivation_functions.a ../../phi/kernels/libwarpctc_kernel.a ../../phi/kernels/libwarpctc_grad_kernel.a ../operators/math/libsequence_padding.a ../operators/math/libsequence_scale.a ../../phi/backends/dynload/libphi_dynload_warpctc.a ../../phi/kernels/libabs_grad_kernel.a ../../phi/kernels/libaccuracy_kernel.a ../../phi/kernels/libadadelta_kernel.a ../../phi/kernels/libadagrad_kernel.a ../../phi/kernels/libadamax_kernel.a ../../phi/kernels/libadd_n_kernel.a ../../phi/kernels/libaddmm_grad_kernel.a ../../phi/kernels/libaddmm_kernel.a ../../phi/kernels/liballclose_kernel.a ../../phi/kernels/libarange_kernel.a ../../phi/kernels/libargsort_grad_kernel.a ../../phi/kernels/libargsort_kernel.a ../../phi/kernels/libatan2_grad_kernel.a ../../phi/kernels/libatan2_kernel.a ../../phi/kernels/libauc_kernel.a ../../phi/kernels/libbatch_norm_grad_kernel.a ../../phi/kernels/libbatch_norm_kernel.a ../../phi/kernels/libbatch_norm_kernel_base.a ../../phi/kernels/libbce_loss_grad_kernel.a ../../phi/kernels/libbce_loss_kernel.a ../../phi/kernels/libbernoulli_kernel.a ../../phi/kernels/libbilinear_tensor_product_grad_kernel.a ../../phi/kernels/libbilinear_tensor_product_kernel.a ../../phi/kernels/libbincount_kernel.a 
../../phi/kernels/libbitwise_kernel.a ../../phi/kernels/libbroadcast_tensors_grad_kernel.a ../../phi/kernels/libbroadcast_tensors_kernel.a ../../phi/kernels/libcast_grad_kernel.a ../../phi/kernels/libcholesky_grad_kernel.a ../../phi/kernels/libcholesky_kernel.a ../../phi/kernels/libcholesky_solve_grad_kernel.a ../../phi/kernels/libcholesky_solve_kernel.a ../../phi/kernels/libclip_grad_kernel.a ../../phi/kernels/libclip_kernel.a ../../phi/kernels/libcompare_kernel.a ../../phi/kernels/libcomplex_grad_kernel.a ../../phi/kernels/libconcat_grad_kernel.a ../../phi/kernels/libconcat_kernel.a ../../phi/kernels/libcross_entropy_grad_kernel.a ../../phi/kernels/libcross_grad_kernel.a ../../phi/kernels/libcross_kernel.a ../../phi/kernels/libcumprod_grad_kernel.a ../../phi/kernels/libcumprod_kernel.a ../../phi/kernels/libcumsum_kernel.a ../../phi/kernels/libdepthwise_conv_grad_kernel.a ../../phi/kernels/libdepthwise_conv_kernel.a ../../phi/kernels/libdeterminant_kernel.a ../../phi/kernels/libdiag_grad_kernel.a ../../phi/kernels/libdiag_kernel.a ../../phi/kernels/libdiagonal_grad_kernel.a ../../phi/kernels/libdiagonal_kernel.a ../../phi/kernels/libdigamma_grad_kernel.a ../../phi/kernels/libdigamma_kernel.a ../../phi/kernels/libdist_grad_kernel.a ../../phi/kernels/libdist_kernel.a ../../phi/kernels/libdot_grad_kernel.a ../../phi/kernels/libdot_kernel.a ../../phi/kernels/libdropout_grad_kernel.a ../../phi/kernels/libdropout_kernel.a ../../phi/kernels/libeigh_grad_kernel.a ../../phi/kernels/libeinsum_grad_kernel.a ../../phi/kernels/libeinsum_kernel.a ../../phi/kernels/libelementwise_grad_kernel.a ../../phi/kernels/libembedding_grad_kernel.a ../../phi/kernels/liberf_grad_kernel.a ../../phi/kernels/liberf_kernel.a ../../phi/kernels/liberfinv_grad_kernel.a ../../phi/kernels/liberfinv_kernel.a ../../phi/kernels/libexpand_as_grad_kernel.a ../../phi/kernels/libexpand_as_kernel.a ../../phi/kernels/libexpand_grad_kernel.a ../../phi/kernels/libeye_kernel.a 
../../phi/kernels/libflatten_grad_kernel.a ../../phi/kernels/libflatten_kernel.a ../../phi/kernels/libflip_kernel.a ../../phi/kernels/libfrobenius_norm_grad_kernel.a ../../phi/kernels/libfrobenius_norm_kernel.a ../../phi/kernels/libgather_grad_kernel.a ../../phi/kernels/libgather_kernel.a ../../phi/kernels/libgather_nd_grad_kernel.a ../../phi/kernels/libgather_nd_kernel.a ../../phi/kernels/libgather_tree_kernel.a ../../phi/kernels/libgaussian_random_kernel.a ../../phi/kernels/libgelu_grad_kernel.a ../../phi/kernels/libgelu_kernel.a ../../phi/kernels/libgraph_reindex_kernel.a ../../phi/kernels/libgraph_sample_neighbors_kernel.a ../../phi/kernels/libgraph_send_recv_grad_kernel.a ../../phi/kernels/libgraph_send_recv_kernel.a ../../phi/kernels/libgrid_sample_grad_kernel.a ../../phi/kernels/libgrid_sample_kernel.a ../../phi/kernels/libhistogram_kernel.a ../../phi/kernels/libhuber_loss_grad_kernel.a ../../phi/kernels/libhuber_loss_kernel.a ../../phi/kernels/libincrement_kernel.a ../../phi/kernels/libindex_sample_grad_kernel.a ../../phi/kernels/libindex_sample_kernel.a ../../phi/kernels/libindex_select_grad_kernel.a ../../phi/kernels/libindex_select_kernel.a ../../phi/kernels/libinterpolate_grad_kernel.a ../../phi/kernels/libinterpolate_kernel.a ../../phi/kernels/libis_empty_kernel.a ../../phi/kernels/libisclose_kernel.a ../../phi/kernels/libisfinite_kernel.a ../../phi/kernels/libkldiv_loss_grad_kernel.a ../../phi/kernels/libkldiv_loss_kernel.a ../../phi/kernels/libkron_grad_kernel.a ../../phi/kernels/libkron_kernel.a ../../phi/kernels/libkthvalue_grad_kernel.a ../../phi/kernels/libkthvalue_kernel.a ../../phi/kernels/liblabel_smooth_grad_kernel.a ../../phi/kernels/liblabel_smooth_kernel.a ../../phi/kernels/liblayer_norm_grad_kernel.a ../../phi/kernels/liblayer_norm_kernel.a ../../phi/kernels/liblerp_grad_kernel.a ../../phi/kernels/liblerp_kernel.a ../../phi/kernels/liblgamma_grad_kernel.a ../../phi/kernels/liblgamma_kernel.a ../../phi/kernels/liblinspace_kernel.a 
../../phi/kernels/liblog_loss_grad_kernel.a ../../phi/kernels/liblog_loss_kernel.a ../../phi/kernels/liblog_softmax_grad_kernel.a ../../phi/kernels/liblog_softmax_kernel.a ../../phi/kernels/liblogical_kernel.a ../../phi/kernels/liblogsumexp_grad_kernel.a ../../phi/kernels/liblogsumexp_kernel.a ../../phi/kernels/libmasked_select_grad_kernel.a ../../phi/kernels/libmasked_select_kernel.a ../../phi/kernels/libmatmul_grad_kernel.a ../../phi/kernels/libcomplex_kernel.a ../../phi/kernels/libmatmul_kernel.a ../../phi/kernels/libmatrix_rank_kernel.a ../../phi/kernels/libmatrix_rank_tol_kernel.a ../../phi/kernels/libreduce_sum_kernel.a ../../phi/kernels/libreduce_sum_kernel_base.a ../../phi/kernels/libabs_kernel.a ../../phi/kernels/libmean_all_grad_kernel.a ../../phi/kernels/libmean_all_kernel.a ../../phi/kernels/libmeshgrid_grad_kernel.a ../../phi/kernels/libmeshgrid_kernel.a ../../phi/kernels/libmode_grad_kernel.a ../../phi/kernels/libmode_kernel.a ../../phi/kernels/libmomentum_kernel.a ../../phi/kernels/libmulti_dot_grad_kernel.a ../../phi/kernels/libmulti_dot_kernel.a ../../phi/kernels/libmultinomial_kernel.a ../../phi/kernels/libarg_min_max_kernel.a ../../phi/kernels/libmultiplex_grad_kernel.a ../../phi/kernels/libmultiplex_kernel.a ../../phi/kernels/libmv_grad_kernel.a ../../phi/kernels/libmv_kernel.a ../../phi/kernels/libnll_loss_grad_kernel.a ../../phi/kernels/libnll_loss_kernel.a ../../phi/kernels/libnll_loss_kernel_base.a ../../phi/kernels/libnorm_grad_kernel.a ../../phi/kernels/libnorm_kernel.a ../../phi/kernels/libone_hot_kernel.a ../../phi/kernels/libone_hot_kernel_base.a ../../phi/kernels/libp_norm_grad_kernel.a ../../phi/kernels/libp_norm_kernel.a ../../phi/kernels/libpad3d_grad_kernel.a ../../phi/kernels/libpad3d_kernel.a ../../phi/kernels/libpad_grad_kernel.a ../../phi/kernels/libpad_kernel.a ../../phi/kernels/libpixel_shuffle_grad_kernel.a ../../phi/kernels/libpixel_shuffle_kernel.a ../../phi/kernels/libpoisson_grad_kernel.a 
../../phi/kernels/libpoisson_kernel.a ../../phi/kernels/libpool_grad_kernel.a ../../phi/kernels/libpool_grad_kernel_base.a ../../phi/kernels/libpool_grad_kernel_gpudnn.a ../../phi/kernels/libpool_kernel.a ../../phi/kernels/libpool_kernel_base.a ../../phi/kernels/libpool_kernel_gpudnn.a ../../phi/kernels/funcs/libpooling.a ../../phi/kernels/libprelu_grad_kernel.a ../../phi/kernels/libprelu_kernel.a ../../phi/kernels/libpsroi_pool_grad_kernel.a ../../phi/kernels/libpsroi_pool_kernel.a ../../phi/kernels/libqr_kernel.a ../../phi/kernels/librandperm_kernel.a ../../phi/kernels/librandint_kernel.a ../../phi/kernels/libreduce_all_kernel.a ../../phi/kernels/libreduce_all_kernel_base.a ../../phi/kernels/libreduce_any_kernel.a ../../phi/kernels/libreduce_any_kernel_base.a ../../phi/kernels/libreduce_max_grad_kernel.a ../../phi/kernels/libreduce_max_kernel.a ../../phi/kernels/libreduce_max_kernel_base.a ../../phi/kernels/libreduce_mean_grad_kernel.a ../../phi/kernels/libreduce_mean_kernel.a ../../phi/kernels/libreduce_mean_kernel_base.a ../../phi/kernels/libreduce_min_grad_kernel.a ../../phi/kernels/libreduce_min_kernel.a ../../phi/kernels/libreduce_min_kernel_base.a ../../phi/kernels/libreduce_prod_grad_kernel.a ../../phi/kernels/libreduce_prod_kernel.a ../../phi/kernels/libreduce_prod_kernel_base.a ../../phi/kernels/libreduce_sum_grad_kernel.a ../../phi/kernels/libcast_kernel.a ../../phi/kernels/libreshape_grad_kernel.a ../../phi/kernels/libreshape_kernel.a ../../phi/kernels/libreverse_kernel.a ../../phi/kernels/libreverse_kernel_base.a ../../phi/kernels/librmsprop_kernel.a ../../phi/kernels/libroi_align_grad_kernel.a ../../phi/kernels/libroi_align_kernel.a ../../phi/kernels/libroi_pool_grad_kernel.a ../../phi/kernels/libroi_pool_kernel.a ../../phi/kernels/libroll_grad_kernel.a ../../phi/kernels/libroll_kernel.a ../../phi/kernels/libscatter_grad_kernel.a ../../phi/kernels/libscatter_kernel.a ../../phi/kernels/libscatter_nd_add_grad_kernel.a 
../../phi/kernels/libscatter_nd_add_kernel.a ../../phi/kernels/libsearchsorted_kernel.a ../../phi/kernels/libselu_grad_kernel.a ../../phi/kernels/libselu_kernel.a ../../phi/kernels/libset_value_grad_kernel.a ../../phi/kernels/libset_value_kernel.a ../../phi/kernels/libsgd_kernel.a ../../phi/kernels/libshard_index_kernel.a ../../phi/kernels/libsigmoid_cross_entropy_with_logits_grad_kernel.a ../../phi/kernels/libsigmoid_cross_entropy_with_logits_kernel.a ../../phi/kernels/libsign_kernel.a ../../phi/kernels/libsize_kernel.a ../../phi/kernels/libslice_grad_kernel.a ../../phi/kernels/libslice_kernel.a ../../phi/kernels/libsparse_weight_embedding_grad_kernel.a ../../phi/kernels/libsparse_weight_embedding_kernel.a ../../phi/kernels/libembedding_kernel.a ../../phi/kernels/libsplit_kernel.a ../../phi/kernels/libsqueeze_grad_kernel.a ../../phi/kernels/libsqueeze_kernel.a ../../phi/kernels/libstack_grad_kernel.a ../../phi/kernels/libstack_kernel.a ../../phi/kernels/libstrided_slice_grad_kernel.a ../../phi/kernels/libstrided_slice_grad_kernel_base.a ../../phi/kernels/libstrided_slice_kernel.a ../../phi/kernels/libstrided_slice_kernel_base.a ../../phi/kernels/libtemporal_shift_grad_kernel.a ../../phi/kernels/libtemporal_shift_kernel.a ../../phi/kernels/libtile_grad_kernel.a ../../phi/kernels/libtile_kernel.a ../../phi/kernels/libtop_k_grad_kernel.a ../../phi/kernels/libtop_k_kernel.a ../../phi/kernels/libtrace_grad_kernel.a ../../phi/kernels/libtrace_kernel.a ../../phi/kernels/libtransfer_layout_kernel.a ../../phi/kernels/libtranspose_grad_kernel.a ../../phi/kernels/libtriangular_solve_kernel.a ../../phi/kernels/libexpand_kernel.a ../../phi/kernels/libtril_triu_grad_kernel.a ../../phi/kernels/libtril_triu_kernel.a ../../phi/kernels/libtrunc_grad_kernel.a ../../phi/kernels/libtrunc_kernel.a ../../phi/kernels/libtruncated_gaussian_random_kernel.a ../../phi/kernels/libunbind_kernel.a ../../phi/kernels/libunfold_grad_kernel.a ../../phi/kernels/libunfold_kernel.a 
../../phi/kernels/libunique_kernel.a ../../phi/kernels/libunsqueeze_grad_kernel.a ../../phi/kernels/libunsqueeze_kernel.a ../../phi/kernels/libunstack_grad_kernel.a ../../phi/kernels/libunstack_kernel.a ../../phi/kernels/libviterbi_decode_kernel.a ../../phi/kernels/libtranspose_kernel.a ../../phi/kernels/libwhere_grad_kernel.a ../../phi/kernels/libwhere_index_kernel.a ../../phi/kernels/libwhere_kernel.a ../../phi/kernels/libyolo_box_kernel.a ../../phi/kernels/libyolov3_loss_grad_kernel.a ../../phi/kernels/libyolov3_loss_kernel.a ../../phi/kernels/sparse/libactivation_grad_kernel_sp.a ../../phi/kernels/libactivation_grad_kernel.a ../../phi/kernels/sparse/libactivation_kernel_sp.a ../../phi/kernels/sparse/libconvolution_grad_kernel_sp.a ../../phi/kernels/sparse/libconvolution_kernel_sp.a ../../phi/kernels/sparse/libcopy_kernel_sp.a ../../phi/kernels/sparse/libsparse_pool_grad_kernel_sp.a ../../phi/kernels/sparse/libsparse_pool_kernel_sp.a ../../phi/kernels/sparse/libsparse_utils_grad_kernel_sp.a ../../phi/kernels/sparse/libsparse_mask_kernel_sp.a ../../phi/kernels/sparse/libsparse_utils_kernel_sp.a ../../phi/kernels/selected_rows/libactivation_kernel_sr.a ../../phi/kernels/libactivation_kernel.a ../../phi/kernels/selected_rows/libassign_kernel_sr.a ../../phi/kernels/libassign_kernel.a ../../phi/kernels/selected_rows/libclip_kernel_sr.a ../../phi/kernels/selected_rows/libcopy_kernel_sr.a ../../phi/kernels/selected_rows/libelementwise_kernel_sr.a ../../phi/kernels/libelementwise_kernel.a ../../phi/kernels/libelementwise_kernel_base.a ../../phi/kernels/selected_rows/libfull_kernel_sr.a ../../phi/kernels/selected_rows/libhierarchical_sigmoid_grad_kernel_sr.a ../../phi/kernels/selected_rows/libisfinite_kernel_sr.a ../../phi/kernels/selected_rows/libscale_kernel_sr.a ../../phi/kernels/libscale_kernel.a ../../phi/kernels/selected_rows/libshape_kernel_sr.a ../../phi/kernels/libshape_kernel.a ../../phi/kernels/selected_rows/libuniform_random_kernel_sr.a 
../../phi/kernels/libuniform_random_kernel.a ../../phi/kernels/strings/libstrings_copy_kernel.a ../../phi/kernels/libcopy_kernel.a ../../phi/kernels/libfull_kernel.a ../../phi/kernels/libfull_kernel_base.a ../../phi/kernels/libempty_kernel.a ../operators/math/libselected_rows_functor.a ../operators/mkldnn/libmkldnn_axpy_handler.a ../operators/math/libvol2col.a ../operators/math/libim2col.a ../../phi/infermeta/libinfermeta.a ../../phi/kernels/funcs/libconcat_and_split_functor.a ../../phi/kernels/strings/libstrings_empty_kernel.a ../../phi/kernels/strings/libstrings_lower_upper_kernel.a ../../phi/kernels/funcs/eigen/libeigen_function.a ../../phi/kernels/funcs/libmath_function.a ../../phi/core/libkernel_context.a ../../phi/backends/libphi_context.a ../../phi/core/liblod_utils.a ../../phi/core/libsparse_csr_tensor.a ../../phi/core/libsparse_coo_tensor.a ../../phi/kernels/funcs/blas/libblas.a ../../phi/core/libcustom_kernel.a ../../phi/core/libkernel_factory.a ../../phi/infermeta/strings/libstring_infermeta.a ../../phi/core/libinfermeta_utils.a ../../phi/core/libmeta_tensor.a ../../phi/kernels/strings/libunicode.a ../../../third_party/install/utf8proc/lib/libutf8proc.a ../../phi/api/lib/utils/libphi_api_utils.a ../../phi/core/libstring_tensor.a ../../phi/core/compat/libconvert_utils.a ../../phi/core/compat/libop_utils.a ../../phi/core/compat/libarg_map_context.a ../../phi/common/libscalar.a ../operators/libgather_scatter_kernel.a ../framework/libselected_rows_utils.a ../framework/liblod_tensor.a ../memory/libmemory.a ../platform/libdevice_context.a ../framework/libscope.a ../../phi/core/libdense_tensor.a ../framework/libvar_type_traits.a ../framework/libtensor.a ../framework/libmixed_vector.a ../../phi/core/libselected_rows.a ../memory/libmalloc.a ../memory/libmemcpy.a ../platform/libprofiler.a ../../phi/backends/cpu/libcpu_context.a ../../phi/backends/gpu/libgpu_context.a ../../phi/backends/custom/libcustom_context.a ../framework/libdata_type.a 
../../phi/backends/libdevice_manager.a ../framework/libfluid_convert_utils.a ../memory/allocation/liballocator_facade.a ../platform/profiler/libnew_profiler.a ../../phi/core/libphi_device_context.a ../../phi/backends/gpu/libgpu_resources.a ../../phi/backends/custom/libcustom_device.a ../memory/allocation/liballocator_strategy.a ../platform/device/gpu/cuda/libcuda_graph.a ../platform/profiler/libhost_tracer.a ../../phi/backends/libdevice_base.a ../memory/allocation/libstream_safe_cuda_allocator.a ../memory/allocation/libcustom_allocator.a ../memory/allocation/libnaive_best_fit_allocator.a ../framework/libselected_rows_utils.a ../framework/liblod_tensor.a ../memory/libmemory.a ../platform/libdevice_context.a ../framework/libscope.a ../../phi/core/libdense_tensor.a ../framework/libvar_type_traits.a ../framework/libtensor.a ../framework/libmixed_vector.a ../../phi/core/libselected_rows.a ../memory/libmalloc.a ../memory/libmemcpy.a ../platform/libprofiler.a ../../phi/backends/cpu/libcpu_context.a ../../phi/backends/gpu/libgpu_context.a ../../phi/backends/custom/libcustom_context.a ../framework/libdata_type.a ../../phi/backends/libdevice_manager.a ../framework/libfluid_convert_utils.a ../memory/allocation/liballocator_facade.a ../platform/profiler/libnew_profiler.a ../../phi/core/libphi_device_context.a ../../phi/backends/gpu/libgpu_resources.a ../../phi/backends/custom/libcustom_device.a ../memory/allocation/liballocator_strategy.a ../platform/device/gpu/cuda/libcuda_graph.a ../platform/profiler/libhost_tracer.a ../../phi/backends/libdevice_base.a ../memory/allocation/libstream_safe_cuda_allocator.a ../memory/allocation/libcustom_allocator.a ../memory/allocation/libnaive_best_fit_allocator.a ../framework/libversion.a ../framework/libgenerator.a ../platform/libstream_callback_manager.a ../../utils/string/libstringpiece.a ../platform/libcpu_helper.a ../../../libcblas.a ../platform/dynload/libdynload_mklml.a ../../phi/backends/dynload/libphi_dynload_mklml.a 
-L/mnt/work/Paddle/build/third_party/install/mklml/lib -liomp5 -Wl,--as-needed ../platform/libcudnn_workspace_helper.a ../platform/device/gpu/libgpu_resource_pool.a ../../../third_party/install/xxhash/lib/libxxhash.a ../framework/libthreadpool.a ../../phi/core/libtensor_meta.a ../../phi/core/libtensor_base.a ../platform/libdevice_tracer.a ../platform/stream/libcuda_stream.a ../platform/libprofiler_proto.a ../../../libmkldnn.a ../../../third_party/install/mkldnn/lib/libdnnl.so ../../../third_party/install/mklml/lib/libiomp5.so ../framework/libframework_proto.a ../platform/profiler/libcuda_tracer.a ../platform/profiler/libcpu_utilization.a ../platform/libos_info.a ../platform/profiler/libevent_bind.a ../platform/profiler/libprofiler_logger.a ../platform/profiler/libprofiler_utils.a ../platform/profiler/dump/libnodetreeproto.a ../platform/profiler/libevent_node.a ../platform/profiler/mlu/libmlu_tracer.a ../framework/new_executor/workqueue/libworkqueue_utils.a ../memory/allocation/libcuda_allocator.a ../memory/allocation/libcuda_managed_allocator.a ../memory/allocation/libpinned_allocator.a ../platform/libcuda_device_guard.a ../memory/allocation/libthread_local_allocator.a ../memory/allocation/libcuda_virtual_mem_allocator.a ../memory/allocation/libcpu_allocator.a ../memory/allocation/liblocked_allocator.a ../memory/allocation/libretry_allocator.a ../memory/allocation/libbuffered_allocator.a ../memory/allocation/libauto_growth_best_fit_allocator.a ../memory/allocation/libvirtual_memory_auto_growth_best_fit_allocator.a ../memory/allocation/libaligned_allocator.a ../memory/allocation/libbest_fit_allocator.a ../../phi/core/libddim.a ../../phi/backends/libstream.a ../../phi/backends/libevent.a ../../phi/backends/libcallback_manager.a ../../phi/backends/libdevice_guard.a ../memory/allocation/liballocator.a ../memory/libstats.a ../memory/detail/libbuddy_allocator.a ../memory/detail/libmemory_block.a ../memory/detail/libsystem_allocator.a ../platform/libplace.a 
../../phi/common/libphi_place.a ../platform/libcpu_info.a ../platform/device/gpu/libgpu_info.a ../platform/libmonitor.a ../platform/dynload/libdynload_cuda.a ../platform/dynload/libdynamic_loader.a ../../phi/backends/gpu/libphi_gpu_info.a ../../phi/backends/gpu/cuda/libphi_cuda_info.a ../../phi/backends/dynload/libphi_dynload_cuda.a ../../phi/backends/dynload/libphi_dynamic_loader.a ../platform/libenforce.a ../../phi/core/libphi_enforce.a ../platform/libflags.a ../../../third_party/install/gflags/lib/libgflags.a ../../phi/core/liberrors.a ../platform/libexternal_error_proto.a -lpthread ../../../third_party/install/glog/lib/libglog.a ../../../third_party/install/protobuf/lib/libprotobuf.a -lcudadevrt -lcudart -lcuda -lcuda  -pthread -ldl -lrt

实际的main函数所在文件op_function_generator的编译命令

cd /mnt/work/Paddle/build/paddle/fluid/pybind && /usr/bin/c++ -DCUDA_TOOLKIT_ROOT_DIR=\"/mnt/work/PPU_SDK/CUDA_SDK\" -DCUDA_VERSION_MAJOR=\"11\" -DCUDA_VERSION_MINOR=\"1\" -DCUDNN_MAJOR_VERSION=\"8\" -DEIGEN_USE_GPU -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CUDA -DPADDLE_WITH_CUPTI -DPADDLE_WITH_CUSTOM_DEVICE -DPADDLE_WITH_MKLDNN -DPADDLE_WITH_MKLML -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_XBYAK -DTRT_PLUGIN_FP16_AVALIABLE -DXBYAK64 -DXBYAK_NO_OP_NAMES -I/mnt/work/Paddle/build -I/mnt/work/Paddle/paddle/fluid/framework/io -I/mnt/work/Paddle/patches/thrust -I/mnt/work/Paddle/build/third_party/install/zlib/include -I/mnt/work/Paddle/build/third_party/install -I/mnt/work/Paddle/build/third_party/install/gflags/include -I/mnt/work/Paddle/build/third_party/install/glog/include -I/mnt/work/Paddle/build/third_party/boost/src/extern_boost -I/mnt/work/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/mnt/work/Paddle/build/third_party/threadpool/src/extern_threadpool -I/mnt/work/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/mnt/work/Paddle/build/third_party/install/xxhash/include -I/mnt/work/Paddle/build/third_party/install/warpctc/include -I/mnt/work/Paddle/build/third_party/install/utf8proc/include -I/mnt/work/Paddle/build/third_party/install/mklml/include -I/mnt/work/Paddle/build/third_party/install/mkldnn/include -I/mnt/work/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.8 -I/usr/local/lib/python3.8/dist-packages/numpy/core/include -I/mnt/work/Paddle/build/third_party/pybind/src/extern_pybind/include -I/mnt/work/Paddle/build/third_party/install/gloo/include -I/mnt/work/Paddle/build/third_party/install/xbyak/include -I/mnt/work/Paddle/build/third_party/install/xbyak/include/xbyak -I/mnt/work/Paddle/build/third_party/pocketfft/src 
-I/mnt/work/PPU_SDK/CUDA_SDK/extras/CUPTI/include -I/mnt/work/PPU_SDK/CUDA_SDK/targets/x86_64-linux/include -I/mnt/work/PPU_SDK/CUDA_SDK/include -I/mnt/work/Paddle -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -fopenmp -mavx -O3 -DNDEBUG -MD -MT paddle/fluid/pybind/CMakeFiles/op_function_generator.dir/op_function_generator.cc.o -MF CMakeFiles/op_function_generator.dir/op_function_generator.cc.o.d -o CMakeFiles/op_function_generator.dir/op_function_generator.cc.o -c /mnt/work/Paddle/paddle/fluid/pybind/op_function_generator.cc

前边都是正常的,在最后一部分有这样 -fopenmp -mavx -O3 -DNDEBUG -MD -MT paddle/fluid/pybind/CMakeFiles/op_function_generator.dir/op_function_generator.cc.o -MF CMakeFiles/op_function_generator.dir/op_function_generator.cc.o.d -o CMakeFiles/op_function_generator.dir/op_function_generator.cc.o -c /mnt/work/Paddle/paddle/fluid/pybind/op_function_generator.cc 的字段,将-O3改为 -O0后,重新link,居然就正常了,但是其他文件这种改动并没有效果,因此判断不是编译优化太高的问题。

由于报错信息是free(): invalid pointer,使用valgrind来排查相关内存问题

valgrind --tool=memcheck --leak-check=full ./kernel_signature_generator &> valgrind.log

得到log信息

==6891== Memcheck, a memory error detector
==6891== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==6891== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==6891== Command: ./kernel_signature_generator
==6891== 
==6891== Warning: set address range perms: large range [0x6d02000, 0x1ab19000) (defined)
==6891== Invalid free() / delete / delete[] / realloc()
==6891==    at 0x1B495FBF: operator delete(void*) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==6891==    by 0x94363F: std::pair<paddle::detailv3::sherwood_v3_table<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, paddle::detailv3::KeyOrValueHasher<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, paddle::detailv3::KeyOrValueEquality<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > >, 
std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > > >, std::allocator<paddle::detailv3::sherwood_v3_entry<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > > > > >::templated_iterator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > > >, bool> paddle::detailv3::sherwood_v3_table<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > >, 
std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, paddle::detailv3::KeyOrValueHasher<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, paddle::detailv3::KeyOrValueEquality<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext 
const&)> > > > > >, std::allocator<paddle::detailv3::sherwood_v3_entry<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > > > > >::emplace_new_key<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, paddle::flat_hash_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > > > >::convertible_to_value>(signed char, paddle::detailv3::sherwood_v3_entry<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, 
paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > > >*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&&, paddle::flat_hash_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<paddle::framework::OpKernelType, std::function<void (paddle::framework::ExecutionContext const&)>, paddle::framework::OpKernelType::Hash, std::equal_to<paddle::framework::OpKernelType>, std::allocator<std::pair<paddle::framework::OpKernelType const, std::function<void (paddle::framework::ExecutionContext const&)> > > > > > >::convertible_to_value&&) (in /mnt/work/Paddle/build/paddle/fluid/pybind/kernel_signature_generator)

在这里,捕获到了非常重要的数据结构sherwood_v3_table。查看其源码paddle/utils/flat_hash_map.h,应该是一个高效的hash_map实现。源码太长,这里不便粘贴;简单看下来,感觉应该是初始化函数和dealloc函数两边不匹配的问题,因此修改初始化和dealloc的部分。

diff --git a/paddle/utils/flat_hash_map.h b/paddle/utils/flat_hash_map.h
    index 64a75fffa5..eae4c00612 100644
    --- a/paddle/utils/flat_hash_map.h
    +++ b/paddle/utils/flat_hash_map.h
    @@ -123,12 +123,12 @@ static constexpr int8_t min_lookups = 4;
template <typename T>
struct sherwood_v3_entry {
sherwood_v3_entry() {}
-  sherwood_v3_entry(int8_t distance_from_desired)
-      : distance_from_desired(distance_from_desired) {}
+  sherwood_v3_entry(int8_t distance_from_desired, bool isDefault = false)
+      : distance_from_desired(distance_from_desired), isDefault(isDefault){}
~sherwood_v3_entry() {}
static sherwood_v3_entry *empty_default_table() {
   static sherwood_v3_entry result[min_lookups] = {
-        {}, {}, {}, {special_end_value}};
+        {-1, true}, {}, {}, {special_end_value}};
return result;
}

@@ -147,6 +147,7 @@ struct sherwood_v3_entry {
}

int8_t distance_from_desired = -1;
+  bool isDefault = false;
static constexpr int8_t special_end_value = 0;
union {
T value;
@@ -745,7 +746,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
void deallocate_data(EntryPointer begin,
size_t num_slots_minus_one,
int8_t max_lookups) {
-    if (begin != Entry::empty_default_table()) {
+    if (!static_cast<sherwood_v3_entry<T>*>(begin)[0].isDefault) {
        AllocatorTraits::deallocate(
        *this, begin, num_slots_minus_one + max_lookups + 1);
}

参考至:https://github.com/amzeratul/halley/commit/37cc635cab7fd9d5f34ca2016731f465ec6f260e

13. 测试

修改代码后,编译,顺利得到whl包。

make -j32
pip install  --force-reinstall ./python/dist/paddlepaddle_gpu-0.0.0-cp38-cp38-linux_x86_64.whl

按照paddle官网提供的mnist测试小程序test_mnist.py

import paddle
from paddle.nn import Linear
import paddle.nn.functional as F
import os
import numpy as np


class MNIST(paddle.nn.Layer):
    """Minimal MNIST regressor: one fully-connected layer mapping 784 -> 1."""

    def __init__(self):
        super(MNIST, self).__init__()
        # Single fully-connected layer; output dimension is 1.
        self.fc = paddle.nn.Linear(in_features=784, out_features=1)

    def forward(self, inputs):
        # Forward computation: apply the linear layer and return its output.
        return self.fc(inputs)

        # Declare the network structure.
        # NOTE(review): this instance is discarded — `model = MNIST()` is
        # executed again further down; one of the two instantiations is
        # redundant.
model = MNIST()

# 图像归一化函数,将数据范围为[0, 255]的图像归一化到[0, 1]
def norm_img(img):
    """Normalize images from the [0, 255] range into [0, 1] and flatten them.

    Expects `img` with shape [batch_size, 28, 28]; returns a tensor of
    shape [batch_size, 784].
    """
    # Validate the incoming layout: [batch_size, height, width].
    assert len(img.shape) == 3
    batch_size, img_h, img_w = img.shape[0], img.shape[1], img.shape[2]
    # Scale pixel values into [0, 1].
    scaled = img / 255
    # Flatten each image into a single 784-element row.
    return paddle.reshape(scaled, [batch_size, img_h * img_w])
    # Ensure images loaded from paddle.vision.datasets.MNIST come back as
    # np.ndarray (the 'cv2' backend).
paddle.vision.set_image_backend('cv2')

# Declare the network structure.
model = MNIST()

def train(model):
    """Run a plain-SGD training loop for `model` on the MNIST training set."""
    # Put the model into training mode.
    model.train()
    # Training data loader: batch size 16, reshuffled each epoch.
    loader = paddle.io.DataLoader(
        paddle.vision.datasets.MNIST(mode='train'),
        batch_size=16,
        shuffle=True)
    # Stochastic gradient descent with a fixed learning rate of 0.001.
    opt = paddle.optimizer.SGD(learning_rate=0.001,
                               parameters=model.parameters())
    EPOCH_NUM = 10
    for epoch in range(EPOCH_NUM):
        for batch_id, batch in enumerate(loader()):
            images = norm_img(batch[0]).astype('float32')
            labels = batch[1].astype('float32')

            # Forward pass.
            predicts = model(images)

            # Mean squared-error loss over the batch.
            avg_loss = paddle.mean(F.square_error_cost(predicts, labels))

            # Report the current loss every 1000 batches.
            if batch_id % 1000 == 0:
                print("epoch_id: {}, batch_id: {}, loss is: {}".format(
                    epoch, batch_id, avg_loss.numpy()))

            # Backward pass and parameter update.
            avg_loss.backward()
            opt.step()
            opt.clear_grad()

# Train the model and persist the learned parameters to disk.
train(model)
paddle.save(model.state_dict(), './mnist.pdparams')

最终测试

python test_mnist.py
[umd rel]: get setting cfg file failed, will use default values.
[umd rel]: **********************************umd + hw*************************************
W1209 16:02:06.561496 103813 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.0, Runtime API Version: 11.0
W1209 16:02:06.561710 103813 gpu_resources.cc:91] device: 0, cuDNN Version: 8.0.
[2022-12-09 16:02:09.760884][ALINPU INFO]: [../../include/accontext.hpp:376] ACOMPUTE: [device caps] name=, sm8.0, smem=256KB, num_of_sm=64, freq_of_sm=1600MHz
[2022-12-09 16:02:09.762743][ALINPU INFO]: [../../include/accontext.hpp:201]ACOMPUTE: Got AIU enable flag: 1
epoch_id: 0, batch_id: 0, loss is: [26.36991]
epoch_id: 0, batch_id: 1000, loss is: [3.0229297]
epoch_id: 0, batch_id: 2000, loss is: [5.8770084]
epoch_id: 0, batch_id: 3000, loss is: [2.4487498]
epoch_id: 1, batch_id: 0, loss is: [5.7081146]
epoch_id: 1, batch_id: 1000, loss is: [4.6929483]
epoch_id: 1, batch_id: 2000, loss is: [3.3755846]
epoch_id: 1, batch_id: 3000, loss is: [3.0124931]
epoch_id: 2, batch_id: 0, loss is: [1.8503466]
...

整体测试通过,paddle打通结束。

如果还是会遇到invalid pointer的问题,可以将sherwood_v3_entry的算法更新一下,相关修改可以参照paddle_nv.patch进行。

paddle.patch

paddle_nv.patch