【博客转载】使用CUDA driver API在运行时加载CUDA kernel (文末送书)

博客来源:https://leimao.github.io/blog/CUDA-Driver-Runtime-Load-Run-Kernel/ ,来自Lei Mao,已获得作者转载授权。后续会转载一些Lei Mao的CUDA相关Blog,也是一个完整的专栏,Blog会从稍早一些的CUDA架构到当前最新的CUDA架构,也会包含实用工程技巧,底层指令分析,Cutlass分析等等多个课题,是一个时间线十分明确的专栏。

使用CUDA driver API在运行时加载CUDA kernel

简介

在某些情况下,我们希望动态打开一些库并运行库中的某些函数。如果某些库并不总是被使用,这可以为程序节省一些内存。要加载C和C++库和函数,我们可以使用dlopendlsym函数来打开共享库并获取用extern "C"指定的函数地址。

在CUDA中,有时CUDA kernel  没有被编译到库中,而是被编译成PTX、CUBIN或FATBIN文件。在这种情况下,我们可以使用CUDA driver API来加载PTX、CUBIN或FATBIN文件并运行CUDA kernel  。

构建PTX/CUBIN/FATBIN文件

CUDA kernel  没有头文件。CUDA kernel  必须用extern “C”指定。否则,CUDA driver API无法使用函数名找到CUDA kernel  。CUDA kernel 文件将被编译成PTX、CUBIN或FATBIN文件。

// Build commands
// ptx
// nvcc --ptx --gpu-architecture=compute_80 vector_add.cu -o vector_add.ptx
// cubin
// nvcc --cubin --gpu-architecture=compute_80 --gpu-code=sm_86 vector_add.cu -o
// vector_add.cubin
// fatbin
// nvcc --fatbin --gpu-architecture=compute_80 --gpu-code=sm_86 vector_add.cu -o
// vector_add.fatbin

#include <cuda_runtime.h>

// The extern "C" is necessary.
extern"C"__global__ void vector_add(int const* a, int const* b, int* c, unsigned int n)
{
    unsignedintconst stride{blockDim.x * gridDim.x};
    unsignedintconst start_idx{blockDim.x * blockIdx.x + threadIdx.x};
    for (unsignedint i{start_idx}; i < n; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}

要将CUDA kernel  编译成PTX、CUBIN或FATBIN文件,请运行以下命令。请根据您使用的GPU架构和GPU SM调整--gpu-architecture--gpu-code标志。

$ ls -l
-rw-rw-r-- 1 1000 1000   699 Feb 24 19:30 vector_add.cu
-rw-r--r-- 1 root root  2984 Feb 24 19:29 vector_add.cubin
-rw-r--r-- 1 root root  3072 Feb 24 19:29 vector_add.fatbin
-rw-r--r-- 1 root root  1279 Feb 24 19:29 vector_add.ptx

也可以使用CMake来构建PTX/CUBIN/FATBIN文件。在我们的情况下,我们可以使用以下CMakeLists.txt文件。

cmake_minimum_required(VERSION 3.28)

project(Build-CUBIN VERSION 0.0.1 LANGUAGES CXX CUDA)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

#
 Find CUDA Toolkit
find_package(CUDAToolkit REQUIRED)

#
 PTX
add_library(vector_add_ptx_sm86 OBJECT vector_add.cu)
set_target_properties(vector_add_ptx_sm86 PROPERTIES CUDA_PTX_COMPILATION ON)
set_target_properties(vector_add_ptx_sm86 PROPERTIES CUDA_ARCHITECTURES "86-virtual")

add_library(vector_add_ptx_sm80 OBJECT vector_add.cu)
set_target_properties(vector_add_ptx_sm80 PROPERTIES CUDA_PTX_COMPILATION ON)
set_target_properties(vector_add_ptx_sm80 PROPERTIES CUDA_ARCHITECTURES "80-virtual")

add_library(vector_add_ptx_sm70 OBJECT vector_add.cu)
set_target_properties(vector_add_ptx_sm70 PROPERTIES CUDA_PTX_COMPILATION ON)
set_target_properties(vector_add_ptx_sm70 PROPERTIES CUDA_ARCHITECTURES "70-virtual")

#
 CUBIN
add_library(vector_add_cubin_sm86 OBJECT vector_add.cu)
set_target_properties(vector_add_cubin_sm86 PROPERTIES CUDA_CUBIN_COMPILATION ON)
set_target_properties(vector_add_cubin_sm86 PROPERTIES CUDA_ARCHITECTURES "86-real")

add_library(vector_add_cubin_sm80 OBJECT vector_add.cu)
set_target_properties(vector_add_cubin_sm80 PROPERTIES CUDA_CUBIN_COMPILATION ON)
set_target_properties(vector_add_cubin_sm80 PROPERTIES CUDA_ARCHITECTURES "80-real")

add_library(vector_add_cubin_sm70 OBJECT vector_add.cu)
set_target_properties(vector_add_cubin_sm70 PROPERTIES CUDA_CUBIN_COMPILATION ON)
set_target_properties(vector_add_cubin_sm70 PROPERTIES CUDA_ARCHITECTURES "70-real")

#
 FATBIN
add_library(vector_add_fatbin OBJECT vector_add.cu)
set_target_properties(vector_add_fatbin PROPERTIES CUDA_FATBIN_COMPILATION ON)
set_target_properties(vector_add_fatbin PROPERTIES CUDA_ARCHITECTURES "80-virtual;80-real;80-real")

使用CUDA driver API加载和运行PTX/CUBIN/FATBIN文件

要在运行时从PTX、CUBIN或FATBIN文件中加载和运行CUDA kernel  ,需要使用CUDA driver API。

// Build command
// g++ run_vector_add.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64
// -lcuda -o run_vector_add

// 使用 CUDA 驱动 API 就足够了。
#include <cuda.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#define CHECK_CUDA_DRIVER_ERROR(val)                                           \
    check_driver((val), #val, __FILE__, __LINE__)

void check_driver(CUresult err, char const* func, char const* file, int line)
{
    if (err != CUDA_SUCCESS)
    {
        std::cerr << "CUDA Driver Error at: " << file << ":" << line
                  << std::endl;
        charconst* err_string{nullptr};
        std::cerr << cuGetErrorString(err, &err_string) << std::endl;
        std::cerr << err_string << " " << func << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

int main(int argc, char** argv)
{
    // 检查命令行参数的数量。
    if (argc != 2)
    {
        std::cerr << "Usage: " << argv[0] << " <ptx_cubin_fatbin_file_path>"
                  << std::endl;
        return1;
    }
    // 从命令行读取 CUDA  kernel 库文件路径。
    std::stringconst ptx_cubin_fatbin_file_path{argv[1]};

    // 使用 CUDA 设备 0。
    CUdevice cuda_device{0};
    CUfunction vector_add_kernel{};
    CUmodule cuda_module{};
    CUcontext cuda_context{};
    CUstream cuda_stream{};

    // 目前,此标志必须为 0。
    unsignedintconst cuda_driver_init_flags{0};
    // 初始化 CUDA 驱动 API。
    CHECK_CUDA_DRIVER_ERROR(cuInit(cuda_driver_init_flags));
    // 目前,此标志必须为 0。
    unsignedintconst cuda_context_init_flags{0};
    // 创建 CUDA 上下文。
    CHECK_CUDA_DRIVER_ERROR(
        cuCtxCreate(&cuda_context, cuda_context_init_flags, cuda_device));
    // 创建 CUDA 流。
    CHECK_CUDA_DRIVER_ERROR(
        cuStreamCreate(&cuda_stream, CU_STREAM_NON_BLOCKING));

    // 确保 cubin/ptx/fatbin 文件存在。
    std::ifstream cuda_kernel_library_file{ptx_cubin_fatbin_file_path};
    if (!cuda_kernel_library_file)
    {
        std::cerr << "Error: The cubin/ptx/fatbin file does not exist."
                  << std::endl;
        return1;
    }

    // 从 cubin/ptx/fatbin 文件创建 CUDA 模块。
    CHECK_CUDA_DRIVER_ERROR(
        cuModuleLoad(&cuda_module, ptx_cubin_fatbin_file_path.c_str()));

    // 从 CUDA 模块获取 CUDA  kernel  。
    CHECK_CUDA_DRIVER_ERROR(
        cuModuleGetFunction(&vector_add_kernel, cuda_module, "vector_add"));

    // 向量中的元素数量。
    unsignedint num_elements{8192};

    std::vector<inthost_vector_a(num_elements, 1);
    std::vector<inthost_vector_b(num_elements, 2);
    std::vector<inthost_vector_c(num_elements, -1);
    std::vector<inthost_vector_c_reference(num_elements, -2);
    // 初始化输入向量。
    for (size_t i{0}; i < num_elements; ++i)
    {
        host_vector_a.at(i) = i;
        host_vector_b.at(i) = i;
    }
    // 计算参考结果。
    for (size_t i{0}; i < num_elements; ++i)
    {
        host_vector_c_reference.at(i) =
            host_vector_a.at(i) + host_vector_b.at(i);
    }

    // 为输入向量分配设备内存。
    CUdeviceptr device_vector_a{};
    CUdeviceptr device_vector_b{};
    CUdeviceptr device_vector_c{};
    CHECK_CUDA_DRIVER_ERROR(
        cuMemAlloc(&device_vector_a, num_elements * sizeof(int)));
    CHECK_CUDA_DRIVER_ERROR(
        cuMemAlloc(&device_vector_b, num_elements * sizeof(int)));
    CHECK_CUDA_DRIVER_ERROR(
        cuMemAlloc(&device_vector_c, num_elements * sizeof(int)));

    // 将输入向量从主机复制到设备。
    CHECK_CUDA_DRIVER_ERROR(cuMemcpyHtoD(device_vector_a, host_vector_a.data(),
                                         num_elements * sizeof(int)));
    CHECK_CUDA_DRIVER_ERROR(cuMemcpyHtoD(device_vector_b, host_vector_b.data(),
                                         num_elements * sizeof(int)));

    // 设置 kernel 参数。
    void* kernel_params[]{&device_vector_a, &device_vector_b, &device_vector_c,
                          &num_elements};
    // 启动 CUDA  kernel 。
    unsignedintconst block_size_x{256};
    unsignedintconst block_size_y{1};
    unsignedintconst block_size_z{1};
    unsignedintconst grid_size_x{(num_elements + block_size_x - 1) /
                                   block_size_x};
    unsignedintconst grid_size_y{1};
    unsignedintconst grid_size_z{1};
    unsignedintconst shared_memory_size{0};
    CHECK_CUDA_DRIVER_ERROR(cuLaunchKernel(
        vector_add_kernel, grid_size_x, grid_size_y, grid_size_z, block_size_x,
        block_size_y, block_size_z, shared_memory_size, cuda_stream,
        kernel_params, nullptr));
    CHECK_CUDA_DRIVER_ERROR(cuStreamSynchronize(cuda_stream));

    // 将结果向量从设备复制到主机。
    CHECK_CUDA_DRIVER_ERROR(cuMemcpyDtoH(host_vector_c.data(), device_vector_c,
                                         num_elements * sizeof(int)));

    // 验证结果。
    for (size_t i{0}; i < num_elements; ++i)
    {
        if (host_vector_c.at(i) != host_vector_c_reference.at(i))
        {
            std::cerr << "Error: The result is incorrect." << std::endl;
            return1;
        }
    }

    // 释放设备内存。
    CHECK_CUDA_DRIVER_ERROR(cuMemFree(device_vector_a));
    CHECK_CUDA_DRIVER_ERROR(cuMemFree(device_vector_b));
    CHECK_CUDA_DRIVER_ERROR(cuMemFree(device_vector_c));

    // 销毁 CUDA 流。
    CHECK_CUDA_DRIVER_ERROR(cuStreamDestroy(cuda_stream));

    // 销毁 CUDA 模块。
    CHECK_CUDA_DRIVER_ERROR(cuModuleUnload(cuda_module));

    // 销毁 CUDA 上下文。
    CHECK_CUDA_DRIVER_ERROR(cuCtxDestroy(cuda_context));

    return0;
}

要构建程序,请运行以下命令。

$ g++ -o run_vector_add run_vector_add.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda

要运行程序,请运行以下命令。应该不会遇到错误。

$ ./run_vector_add vector_add.ptx
$ ./run_vector_add vector_add.cubin
$ ./run_vector_add vector_add.fatbin

使用CUDA driver API加载和运行PTX/CUBIN/FATBIN字符串

通常也会看到PTX、CUBIN或FATBIN字符串嵌入在C++源代码中。例如,TensorRT多头注意力 kernel cubins(https://github.com/NVIDIA/TensorRT/blob/97ff24489d0ea979c418c7a0847dfc14c8483846/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm80.cpp)在C++源代码中被表示为字符串。在这种情况下,PTX、CUBIN或FATBIN字符串使用一些脚本从PTX、CUBIN或FATBIN文件转换而来。

例如,我们可以使用以下Python脚本将我们刚才构建的PTX、CUBIN或FATBIN文件转换为PTX、CUBIN或FATBIN字符串。

# Open a specified file and convert every byte to a list of unsigned chars in hexadecimal format,
# then write the list to a new text file.
# Usage: python convert_file_to_chars.py --input_bin_file <input_file> --output_char_file <output_file>

import argparse
import os


def main():

    parser 
= argparse.ArgumentParser(
        description='Convert a binary file to a list of unsigned chars')
    parser.add_argument('--input_bin_file',
                        type=str,
                        help='The input binary file')
    parser.add_argument('--output_char_file',
                        type=str,
                        help='The output char file')

    args = parser.parse_args()

    ifnot os.path.isfile(args.input_bin_file):
        print('The input binary file does not exist')
        return

    with open(args.input_bin_file, 'rb') as f:
        content = f.read()
    chars = list(content)
    # Convert each char to a string in hexadecimal format "0x??"
    chars = ['0x{:02x}'.format(c) for c in chars]
    with open(args.output_char_file, 'w') as f:
        f.write(', '.join(chars))


if __name__ == "__main__":

    main()
$ python3 convert_file_to_chars.py --help
usage: convert_file_to_chars.py [-h] [--input_bin_file INPUT_BIN_FILE]
                                [--output_char_file OUTPUT_CHAR_FILE]

Convert a binary file to a list of unsigned chars

options:
  -h, --help            show this help message and exit
  --input_bin_file INPUT_BIN_FILE
                        The input binary file
  --output_char_file OUTPUT_CHAR_FILE
                        The output char file

PTX、CUBIN或FATBIN字符串应该然后被复制到C++源代码中。

// vector_add_ptx_cubin_fatbin_sm86.hpp
extern unsigned char vector_add_int_ptx[];
extern unsigned char const vector_add_int_cubin[];
extern unsigned char vector_add_int_fatbin[];
// vector_add_ptx_cubin_fatbin_sm86.cpp
#include "vector_add_ptx_cubin_fatbin_sm86.hpp"

unsignedchar vector_add_int_ptx[] = {
    0x2f0x2f0x0a0x2f0x2f0x200x470x650x6e0x650x720x61,
    0x740x650x640x200x620x790x200x4e0x560x490x440x49,
    0x410x200x4e0x560x560x4d0x200x430x6f0x6d0x700x69,
    0x6c0x650x720x0a0x2f0x2f0x0a0x2f0x2f0x200x430x6f,
    0x6d0x700x690x6c0x650x720x200x420x750x690x6c0x64,
    0x200x490x440x3a0x200x430x4c0x2d0x330x340x300x39,
    0x370x390x360x370x0a0x2f0x2f0x200x430x750x640x61,
    0x200x630x6f0x6d0x700x690x6c0x610x740x690x6f0x6e,
    0x200x740x6f0x6f0x6c0x730x2c0x200x720x650x6c0x65,
    0x610x730x650x200x310x320x2e0x340x2c0x200x560x31,
    0x320x2e0x340x2e0x310x330x310x0a0x2f0x2f0x200x42,
    0x610x730x650x640x200x6f0x6e0x200x4e0x560x560x4d,
    0x200x370x2e0x300x2e0x310x0a0x2f0x2f0x0a0x0a0x2e,
    0x760x650x720x730x690x6f0x6e0x200x380x2e0x340x0a,
    0x2e0x740x610x720x670x650x740x200x730x6d0x5f0x38,
    0x300x0a0x2e0x610x640x640x720x650x730x730x5f0x73,
    0x690x7a0x650x200x360x340x0a0x0a0x090x2f0x2f0x20,
    0x2e0x670x6c0x6f0x620x6c0x090x760x650x630x740x6f,
    0x720x5f0x610x640x640x0a0x0a0x2e0x760x690x730x69,
    0x620x6c0x650x200x2e0x650x6e0x740x720x790x200x76,
    0x650x630x740x6f0x720x5f0x610x640x640x280x0a0x09,
    0x2e0x700x610x720x610x6d0x200x2e0x750x360x340x20,
    0x760x650x630x740x6f0x720x5f0x610x640x640x5f0x70,
    0x610x720x610x6d0x5f0x300x2c0x0a0x090x2e0x700x61,
    0x720x610x6d0x200x2e0x750x360x340x200x760x650x63,
    0x740x6f0x720x5f0x610x640x640x5f0x700x610x720x61,
    0x6d0x5f0x310x2c0x0a0x090x2e0x700x610x720x610x6d,
    0x200x2e0x750x360x340x200x760x650x630x740x6f0x72,
    0x5f0x610x640x640x5f0x700x610x720x610x6d0x5f0x32,
    0x2c0x0a0x090x2e0x700x610x720x610x6d0x200x2e0x75,
    0x330x320x200x760x650x630x740x6f0x720x5f0x610x64,
    0x640x5f0x700x610x720x610x6d0x5f0x330x0a0x290x0a,
    0x7b0x0a0x090x2e0x720x650x670x200x2e0x700x720x65,
    0x640x200x090x250x700x3c0x330x3e0x3b0x0a0x090x2e,
    0x720x650x670x200x2e0x620x330x320x200x090x250x72,
    0x3c0x310x340x3e0x3b0x0a0x090x2e0x720x650x670x20,
    0x2e0x620x360x340x200x090x250x720x640x3c0x310x31,
    0x3e0x3b0x0a0x0a0x0a0x090x6c0x640x2e0x700x610x72,
    0x610x6d0x2e0x750x360x340x200x090x250x720x640x34,
    0x2c0x200x5b0x760x650x630x740x6f0x720x5f0x610x64,
    0x640x5f0x700x610x720x610x6d0x5f0x300x5d0x3b0x0a,
    0x090x6c0x640x2e0x700x610x720x610x6d0x2e0x750x36,
    0x340x200x090x250x720x640x350x2c0x200x5b0x760x65,
    0x630x740x6f0x720x5f0x610x640x640x5f0x700x610x72,
    0x610x6d0x5f0x310x5d0x3b0x0a0x090x6c0x640x2e0x70,
    0x610x720x610x6d0x2e0x750x360x340x200x090x250x72,
    0x640x360x2c0x200x5b0x760x650x630x740x6f0x720x5f,
    0x610x640x640x5f0x700x610x720x610x6d0x5f0x320x5d,
    0x3b0x0a0x090x6c0x640x2e0x700x610x720x610x6d0x2e,
    0x750x330x320x200x090x250x720x360x2c0x200x5b0x76,
    0x650x630x740x6f0x720x5f0x610x640x640x5f0x700x61,
    0x720x610x6d0x5f0x330x5d0x3b0x0a0x090x6d0x6f0x76,
    0x2e0x750x330x320x200x090x250x720x310x2c0x200x25,
    0x6e0x740x690x640x2e0x780x3b0x0a0x090x6d0x6f0x76,
    0x2e0x750x330x320x200x090x250x720x370x2c0x200x25,
    0x630x740x610x690x640x2e0x780x3b0x0a0x090x6d0x6f,
    0x760x2e0x750x330x320x200x090x250x720x380x2c0x20,
    0x250x740x690x640x2e0x780x3b0x0a0x090x6d0x610x64,
    0x2e0x6c0x6f0x2e0x730x330x320x200x090x250x720x31,
    0x330x2c0x200x250x720x310x2c0x200x250x720x370x2c,
    0x200x250x720x380x3b0x0a0x090x730x650x740x700x2e,
    0x670x650x2e0x750x330x320x200x090x250x700x310x2c,
    0x200x250x720x310x330x2c0x200x250x720x360x3b0x0a,
    0x090x400x250x700x310x200x620x720x610x200x090x24,
    0x4c0x5f0x5f0x420x420x300x5f0x330x3b0x0a0x0a0x09,
    0x6d0x6f0x760x2e0x750x330x320x200x090x250x720x39,
    0x2c0x200x250x6e0x630x740x610x690x640x2e0x780x3b,
    0x0a0x090x6d0x750x6c0x2e0x6c0x6f0x2e0x730x330x32,
    0x200x090x250x720x330x2c0x200x250x720x310x2c0x20,
    0x250x720x390x3b0x0a0x090x630x760x740x610x2e0x74,
    0x6f0x2e0x670x6c0x6f0x620x610x6c0x2e0x750x360x34,
    0x200x090x250x720x640x310x2c0x200x250x720x640x34,
    0x3b0x0a0x090x630x760x740x610x2e0x740x6f0x2e0x67,
    0x6c0x6f0x620x610x6c0x2e0x750x360x340x200x090x25,
    0x720x640x320x2c0x200x250x720x640x350x3b0x0a0x09,
    0x630x760x740x610x2e0x740x6f0x2e0x670x6c0x6f0x62,
    0x610x6c0x2e0x750x360x340x200x090x250x720x640x33,
    0x2c0x200x250x720x640x360x3b0x0a0x0a0x240x4c0x5f,
    0x5f0x420x420x300x5f0x320x3a0x0a0x090x6d0x750x6c,
    0x2e0x770x690x640x650x2e0x750x330x320x200x090x25,
    0x720x640x370x2c0x200x250x720x310x330x2c0x200x34,
    0x3b0x0a0x090x610x640x640x2e0x730x360x340x200x09,
    0x250x720x640x380x2c0x200x250x720x640x310x2c0x20,
    0x250x720x640x370x3b0x0a0x090x610x640x640x2e0x73,
    0x360x340x200x090x250x720x640x390x2c0x200x250x72,
    0x640x320x2c0x200x250x720x640x370x3b0x0a0x090x6c,
    0x640x2e0x670x6c0x6f0x620x610x6c0x2e0x750x330x32,
    0x200x090x250x720x310x300x2c0x200x5b0x250x720x64,
    0x390x5d0x3b0x0a0x090x6c0x640x2e0x670x6c0x6f0x62,
    0x610x6c0x2e0x750x330x320x200x090x250x720x310x31,
    0x2c0x200x5b0x250x720x640x380x5d0x3b0x0a0x090x61,
    0x640x640x2e0x730x330x320x200x090x250x720x310x32,
    0x2c0x200x250x720x310x300x2c0x200x250x720x310x31,
    0x3b0x0a0x090x610x640x640x2e0x730x360x340x200x09,
    0x250x720x640x310x300x2c0x200x250x720x640x330x2c,
    0x200x250x720x640x370x3b0x0a0x090x730x740x2e0x67,
    0x6c0x6f0x620x610x6c0x2e0x750x330x320x200x090x5b,
    0x250x720x640x310x300x5d0x2c0x200x250x720x310x32,
    0x3b0x0a0x090x610x640x640x2e0x730x330x320x200x09,
    0x250x720x310x330x2c0x200x250x720x310x330x2c0x20,
    0x250x720x330x3b0x0a0x090x730x650x740x700x2e0x6c,
    0x740x2e0x750x330x320x200x090x250x700x320x2c0x20,
    0x250x720x310x330x2c0x200x250x720x360x3b0x0a0x09,
    0x400x250x700x320x200x620x720x610x200x090x240x4c,
    0x5f0x5f0x420x420x300x5f0x320x3b0x0a0x0a0x240x4c,
    0x5f0x5f0x420x420x300x5f0x330x3a0x0a0x090x720x65,
    0x740x3b0x0a0x0a0x7d0x0a0x0a};

unsignedcharconst vector_add_int_cubin[] = {
    0x7f0x450x4c0x460x020x010x010x330x070x000x000x00,
    0x000x000x000x000x020x000xbe0x000x7c0x000x000x00,
    0x000x000x000x000x000x000x000x000x000x0b0x000x00,
    0x000x000x000x000x000x080x000x000x000x000x000x00,
    0x560x050x500x000x400x000x380x000x030x000x400x00,
    0x0c0x000x010x000x000x2e0x730x680x730x740x720x74,
    0x610x620x000x2e0x730x740x720x740x610x620x000x2e,
    0x730x790x6d0x740x610x620x000x2e0x730x790x6d0x74,
    0x610x620x5f0x730x680x6e0x640x780x000x2e0x6e0x76,
    0x2e0x690x6e0x660x6f0x000x2e0x740x650x780x740x2e,
    0x760x650x630x740x6f0x720x5f0x610x640x640x000x2e,
    0x6e0x760x2e0x690x6e0x660x6f0x2e0x760x650x630x74,
    0x6f0x720x5f0x610x640x640x000x2e0x6e0x760x2e0x73,
    0x680x610x720x650x640x2e0x760x650x630x740x6f0x72,
    0x5f0x610x640x640x000x2e0x6e0x760x2e0x630x6f0x6e,
    0x730x740x610x6e0x740x300x2e0x760x650x630x740x6f,
    0x720x5f0x610x640x640x000x2e0x720x650x6c0x2e0x6e,
    0x760x2e0x630x6f0x6e0x730x740x610x6e0x740x300x2e,
    0x760x650x630x740x6f0x720x5f0x610x640x640x000x2e,
    0x640x650x620x750x670x5f0x660x720x610x6d0x650x00,
    0x2e0x720x650x6c0x2e0x640x650x620x750x670x5f0x66,
    0x720x610x6d0x650x000x2e0x720x650x6c0x610x2e0x64,
    0x650x620x750x670x5f0x660x720x610x6d0x650x000x2e,
    0x6e0x760x2e0x630x610x6c0x6c0x670x720x610x700x68,
    0x000x2e0x6e0x760x2e0x700x720x6f0x740x6f0x740x79,
    0x700x650x000x2e0x6e0x760x2e0x720x650x6c0x2e0x61,
    0x630x740x690x6f0x6e0x000x000x2e0x730x680x730x74,
    0x720x740x610x620x000x2e0x730x740x720x740x610x62,
    0x000x2e0x730x790x6d0x740x610x620x000x2e0x730x79,
    0x6d0x740x610x620x5f0x730x680x6e0x640x780x000x2e,
    0x6e0x760x2e0x690x6e0x660x6f0x000x2e0x740x650x78,
    0x740x2e0x760x650x630x740x6f0x720x5f0x610x640x64,
    0x000x2e0x6e0x760x2e0x690x6e0x660x6f0x2e0x760x65,
    0x630x740x6f0x720x5f0x610x640x640x000x2e0x6e0x76,
    0x2e0x730x680x610x720x650x640x2e0x760x650x630x74,
    0x6f0x720x5f0x610x640x640x000x2e0x720x650x6c0x2e,
    0x6e0x760x2e0x630x6f0x6e0x730x740x610x6e0x740x30,
    0x2e0x760x650x630x740x6f0x720x5f0x610x640x640x00,
    0x2e0x6e0x760x2e0x630x6f0x6e0x730x740x610x6e0x74,
    0x300x2e0x760x650x630x740x6f0x720x5f0x610x640x64,
    0x000x2e0x640x650x620x750x670x5f0x660x720x610x6d,
    0x650x000x2e0x720x650x6c0x2e0x640x650x620x750x67,
    0x5f0x660x720x610x6d0x650x000x2e0x720x650x6c0x61,
    0x2e0x640x650x620x750x670x5f0x660x720x610x6d0x65,
    0x000x2e0x6e0x760x2e0x630x610x6c0x6c0x670x720x61,
    0x700x680x000x2e0x6e0x760x2e0x700x720x6f0x740x6f,
    0x740x790x700x650x000x2e0x6e0x760x2e0x720x650x6c,
    0x2e0x610x630x740x690x6f0x6e0x000x760x650x630x74,
    0x6f0x720x5f0x610x640x640x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x320x000x000x00,
    0x030x000x0b0x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x8a0x000x000x00,
    0x030x000x0a0x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000xa30x000x000x00,
    0x030x000x040x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000xd30x000x000x00,
    0x030x000x070x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000xef0x000x000x00,
    0x030x000x080x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000xfe0x000x000x00,
    0x120x100x0b0x000x000x000x000x000x000x000x000x00,
    0x000x020x000x000x000x000x000x000xff0xff0xff0xff,
    0x240x000x000x000x000x000x000x000xff0xff0xff0xff,
    0xff0xff0xff0xff0x030x000x040x7c0xff0xff0xff0xff,
    0x0f0x0c0x810x800x800x280x000x080xff0x810x800x28,
    0x080x810x800x800x280x000x000x000xff0xff0xff0xff,
    0x340x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x020x000x000x000x000x000x000x040x040x000x00,
    0x000x040x140x000x000x000x0c0x810x800x800x280x00,
    0x040x340x000x000x000x000x000x000x000x000x000x00,
    0x040x2f0x080x000x060x000x000x000x0e0x000x000x00,
    0x040x120x080x000x060x000x000x000x000x000x000x00,
    0x040x110x080x000x060x000x000x000x000x000x000x00,
    0x040x120x080x000x060x000x000x000x000x000x000x00,
    0x040x370x040x000x7c0x000x000x000x010x350x000x00,
    0x040x0a0x080x000x020x000x000x000x600x010x1c0x00,
    0x030x190x1c0x000x040x170x0c0x000x000x000x000x00,
    0x030x000x180x000x000xf00x110x000x040x170x0c0x00,
    0x000x000x000x000x020x000x100x000x000xf00x210x00,
    0x040x170x0c0x000x000x000x000x000x010x000x080x00,
    0x000xf00x210x000x040x170x0c0x000x000x000x000x00,
    0x000x000x000x000x000xf00x210x000x030x1b0xff0x00,
    0x040x1c0x080x000x500x000x000x000x300x010x000x00,
    0x040x1e0x040x000x000x000x000x000x000x000x000x00,
    0xff0xff0xff0xff0x000x000x000x000xfe0xff0xff0xff,
    0x000x000x000x000xfd0xff0xff0xff0x000x000x000x00,
    0xfc0xff0xff0xff0x000x000x000x000x730x000x000x00,
    0x000x000x000x000x000x000x000x110x250x000x050x36,
    0x440x000x000x000x000x000x000x000x020x000x000x00,
    0x060x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x020x7a0x010x000x000x0a0x000x000x000x0f0x000x00,
    0x000xe40x0f0x000x190x790x000x000x000x000x000x00,
    0x000x250x000x000x000x280x0e0x000x190x790x030x00,
    0x000x000x000x000x000x210x000x000x000x240x0e0x00,
    0x240x7a0x000x000x000x000x000x000x030x020x8e0x07,
    0x000xca0x1f0x000x0c0x7a0x000x000x000x5e0x000x00,
    0x700x600xf00x030x000xda0x0f0x000x4d0x090x000x00,
    0x000x000x000x000x000x000x800x030x000xea0x0f0x00,
    0xb90x7a0x040x000x000x460x000x000x000x0a0x000x00,
    0x000xe40x0f0x000x020x780x070x000x040x000x000x00,
    0x000x0f0x000x000x000xca0x1f0x000x250x760x020x00,
    0x000x580x000x000x070x000x8e0x070x000xc80x0f0x00,
    0x250x760x040x000x000x5a0x000x000x070x000x8e0x07,
    0x000xe40x0f0x0c0x810x790x030x020x040x000x000x00,
    0x000x190x1e0x0c0x000xa80x0e0x000x810x790x040x04,
    0x040x000x000x000x000x190x1e0x0c0x000xa20x0e0x00,
    0x250x760x060x000x000x5c0x000x000x070x000x8e0x07,
    0x000xe20x0f0x000x020x7a0x0b0x000x000x000x000x00,
    0x000x0f0x000x000x000xca0x0f0x000x240x7a0x000x0b,
    0x000x030x000x000x000x020x8e0x070x000xca0x0f0x00,
    0x0c0x7a0x000x000x000x5e0x000x000x700x600xf00x03,
    0x000xe40x0f0x000x100x720x090x040x030x000x000x00,
    0xff0xe00xff0x070x000xca0x4f0x000x860x790x000x06,
    0x090x000x000x000x040x190x100x0c0x000xec0x010x00,
    0x470x890x000x000x400xff0xff0xff0xff0xff0x830x03,
    0x000xea0x0f0x000x4d0x790x000x000x000x000x000x00,
    0x000x000x800x030x000xea0x0f0x000x470x790x000x00,
    0xf00xff0xff0xff0xff0xff0x830x030x000xc00x0f0x00,
    0x180x790x000x000x000x000x000x000x000x000x000x00,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x180x790x000x00,
    0x000x000x000x000x000x000x000x000x000xc00x0f0x00,
    0x180x790x000x000x000x000x000x000x000x000x000x00,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x180x790x000x00,
    0x000x000x000x000x000x000x000x000x000xc00x0f0x00,
    0x180x790x000x000x000x000x000x000x000x000x000x00,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x180x790x000x00,
    0x000x000x000x000x000x000x000x000x000xc00x0f0x00,
    0x180x790x000x000x000x000x000x000x000x000x000x00,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x010x000x000x000x030x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x400x000x000x000x000x000x000x000xfe0x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x010x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x0b0x000x000x000x030x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x3e0x010x000x000x000x000x000x00,
    0x090x010x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x010x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x130x000x000x00,
    0x020x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x480x020x000x00,
    0x000x000x000x000xa80x000x000x000x000x000x000x00,
    0x020x000x000x000x060x000x000x000x080x000x000x00,
    0x000x000x000x000x180x000x000x000x000x000x000x00,
    0xa30x000x000x000x010x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xf00x020x000x000x000x000x000x000x700x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x010x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x290x000x000x000x000x000x000x70,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x600x030x000x000x000x000x000x00,
    0x300x000x000x000x000x000x000x000x030x000x000x00,
    0x000x000x000x000x040x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x430x000x000x00,
    0x000x000x000x700x400x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x900x030x000x00,
    0x000x000x000x000x740x000x000x000x000x000x000x00,
    0x030x000x000x000x0b0x000x000x000x040x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xd30x000x000x000x010x000x000x700x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x040x040x000x000x000x000x000x000x200x000x000x00,
    0x000x000x000x000x030x000x000x000x000x000x000x00,
    0x040x000x000x000x000x000x000x000x080x000x000x00,
    0x000x000x000x000xef0x000x000x000x0b0x000x000x70,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x280x040x000x000x000x000x000x00,
    0x100x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x080x000x000x000x000x000x000x00,
    0x080x000x000x000x000x000x000x000xb00x000x000x00,
    0x090x000x000x000x400x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x380x040x000x00,
    0x000x000x000x000x100x000x000x000x000x000x000x00,
    0x030x000x000x000x040x000x000x000x080x000x000x00,
    0x000x000x000x000x100x000x000x000x000x000x000x00,
    0x6d0x000x000x000x010x000x000x000x420x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x480x040x000x000x000x000x000x000x7c0x010x000x00,
    0x000x000x000x000x000x000x000x000x0b0x000x000x00,
    0x040x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x320x000x000x000x010x000x000x00,
    0x060x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x060x000x000x000x000x000x00,
    0x000x020x000x000x000x000x000x000x030x000x000x00,
    0x060x000x000x0e0x800x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x060x000x000x00,
    0x050x000x000x000x000x0b0x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000xa80x000x000x000x000x000x000x00,
    0xa80x000x000x000x000x000x000x000x080x000x000x00,
    0x000x000x000x000x010x000x000x000x050x000x000x00,
    0x480x040x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xb80x030x000x000x000x000x000x000xb80x030x000x00,
    0x000x000x000x000x080x000x000x000x000x000x000x00,
    0x010x000x000x000x050x000x000x000x000x0b0x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000xa80x000x000x00,
    0x000x000x000x000xa80x000x000x000x000x000x000x00,
    0x080x000x000x000x000x000x000x00};

unsignedchar vector_add_int_fatbin[] = {
    0x500xed0x550xba0x010x000x100x000xf00x0b0x000x00,
    0x000x000x000x000x020x000x010x010x480x000x000x00,
    0xa80x0b0x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x070x000x010x000x560x000x000x00,
    0x000x000x000x000x000x000x000x000x110x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x7f0x450x4c0x460x020x010x010x33,
    0x070x000x000x000x000x000x000x000x020x000xbe0x00,
    0x7c0x000x000x000x000x000x000x000x000x000x000x00,
    0x000x0b0x000x000x000x000x000x000x000x080x000x00,
    0x000x000x000x000x560x050x500x000x400x000x380x00,
    0x030x000x400x000x0c0x000x010x000x000x2e0x730x68,
    0x730x740x720x740x610x620x000x2e0x730x740x720x74,
    0x610x620x000x2e0x730x790x6d0x740x610x620x000x2e,
    0x730x790x6d0x740x610x620x5f0x730x680x6e0x640x78,
    0x000x2e0x6e0x760x2e0x690x6e0x660x6f0x000x2e0x74,
    0x650x780x740x2e0x760x650x630x740x6f0x720x5f0x61,
    0x640x640x000x2e0x6e0x760x2e0x690x6e0x660x6f0x2e,
    0x760x650x630x740x6f0x720x5f0x610x640x640x000x2e,
    0x6e0x760x2e0x730x680x610x720x650x640x2e0x760x65,
    0x630x740x6f0x720x5f0x610x640x640x000x2e0x6e0x76,
    0x2e0x630x6f0x6e0x730x740x610x6e0x740x300x2e0x76,
    0x650x630x740x6f0x720x5f0x610x640x640x000x2e0x72,
    0x650x6c0x2e0x6e0x760x2e0x630x6f0x6e0x730x740x61,
    0x6e0x740x300x2e0x760x650x630x740x6f0x720x5f0x61,
    0x640x640x000x2e0x640x650x620x750x670x5f0x660x72,
    0x610x6d0x650x000x2e0x720x650x6c0x2e0x640x650x62,
    0x750x670x5f0x660x720x610x6d0x650x000x2e0x720x65,
    0x6c0x610x2e0x640x650x620x750x670x5f0x660x720x61,
    0x6d0x650x000x2e0x6e0x760x2e0x630x610x6c0x6c0x67,
    0x720x610x700x680x000x2e0x6e0x760x2e0x700x720x6f,
    0x740x6f0x740x790x700x650x000x2e0x6e0x760x2e0x72,
    0x650x6c0x2e0x610x630x740x690x6f0x6e0x000x000x2e,
    0x730x680x730x740x720x740x610x620x000x2e0x730x74,
    0x720x740x610x620x000x2e0x730x790x6d0x740x610x62,
    0x000x2e0x730x790x6d0x740x610x620x5f0x730x680x6e,
    0x640x780x000x2e0x6e0x760x2e0x690x6e0x660x6f0x00,
    0x2e0x740x650x780x740x2e0x760x650x630x740x6f0x72,
    0x5f0x610x640x640x000x2e0x6e0x760x2e0x690x6e0x66,
    0x6f0x2e0x760x650x630x740x6f0x720x5f0x610x640x64,
    0x000x2e0x6e0x760x2e0x730x680x610x720x650x640x2e,
    0x760x650x630x740x6f0x720x5f0x610x640x640x000x2e,
    0x720x650x6c0x2e0x6e0x760x2e0x630x6f0x6e0x730x74,
    0x610x6e0x740x300x2e0x760x650x630x740x6f0x720x5f,
    0x610x640x640x000x2e0x6e0x760x2e0x630x6f0x6e0x73,
    0x740x610x6e0x740x300x2e0x760x650x630x740x6f0x72,
    0x5f0x610x640x640x000x2e0x640x650x620x750x670x5f,
    0x660x720x610x6d0x650x000x2e0x720x650x6c0x2e0x64,
    0x650x620x750x670x5f0x660x720x610x6d0x650x000x2e,
    0x720x650x6c0x610x2e0x640x650x620x750x670x5f0x66,
    0x720x610x6d0x650x000x2e0x6e0x760x2e0x630x610x6c,
    0x6c0x670x720x610x700x680x000x2e0x6e0x760x2e0x70,
    0x720x6f0x740x6f0x740x790x700x650x000x2e0x6e0x76,
    0x2e0x720x650x6c0x2e0x610x630x740x690x6f0x6e0x00,
    0x760x650x630x740x6f0x720x5f0x610x640x640x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x320x000x000x000x030x000x0b0x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x8a0x000x000x000x030x000x0a0x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xa30x000x000x000x030x000x040x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xd30x000x000x000x030x000x070x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xef0x000x000x000x030x000x080x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xfe0x000x000x000x120x100x0b0x000x000x000x000x00,
    0x000x000x000x000x000x020x000x000x000x000x000x00,
    0xff0xff0xff0xff0x240x000x000x000x000x000x000x00,
    0xff0xff0xff0xff0xff0xff0xff0xff0x030x000x040x7c,
    0xff0xff0xff0xff0x0f0x0c0x810x800x800x280x000x08,
    0xff0x810x800x280x080x810x800x800x280x000x000x00,
    0xff0xff0xff0xff0x340x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x020x000x000x000x000x000x00,
    0x040x040x000x000x000x040x140x000x000x000x0c0x81,
    0x800x800x280x000x040x340x000x000x000x000x000x00,
    0x000x000x000x000x040x2f0x080x000x060x000x000x00,
    0x0e0x000x000x000x040x120x080x000x060x000x000x00,
    0x000x000x000x000x040x110x080x000x060x000x000x00,
    0x000x000x000x000x040x120x080x000x060x000x000x00,
    0x000x000x000x000x040x370x040x000x7c0x000x000x00,
    0x010x350x000x000x040x0a0x080x000x020x000x000x00,
    0x600x010x1c0x000x030x190x1c0x000x040x170x0c0x00,
    0x000x000x000x000x030x000x180x000x000xf00x110x00,
    0x040x170x0c0x000x000x000x000x000x020x000x100x00,
    0x000xf00x210x000x040x170x0c0x000x000x000x000x00,
    0x010x000x080x000x000xf00x210x000x040x170x0c0x00,
    0x000x000x000x000x000x000x000x000x000xf00x210x00,
    0x030x1b0xff0x000x040x1c0x080x000x500x000x000x00,
    0x300x010x000x000x040x1e0x040x000x000x000x000x00,
    0x000x000x000x000xff0xff0xff0xff0x000x000x000x00,
    0xfe0xff0xff0xff0x000x000x000x000xfd0xff0xff0xff,
    0x000x000x000x000xfc0xff0xff0xff0x000x000x000x00,
    0x730x000x000x000x000x000x000x000x000x000x000x11,
    0x250x000x050x360x440x000x000x000x000x000x000x00,
    0x020x000x000x000x060x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x020x7a0x010x000x000x0a0x000x00,
    0x000x0f0x000x000x000xe40x0f0x000x190x790x000x00,
    0x000x000x000x000x000x250x000x000x000x280x0e0x00,
    0x190x790x030x000x000x000x000x000x000x210x000x00,
    0x000x240x0e0x000x240x7a0x000x000x000x000x000x00,
    0x030x020x8e0x070x000xca0x1f0x000x0c0x7a0x000x00,
    0x000x5e0x000x000x700x600xf00x030x000xda0x0f0x00,
    0x4d0x090x000x000x000x000x000x000x000x000x800x03,
    0x000xea0x0f0x000xb90x7a0x040x000x000x460x000x00,
    0x000x0a0x000x000x000xe40x0f0x000x020x780x070x00,
    0x040x000x000x000x000x0f0x000x000x000xca0x1f0x00,
    0x250x760x020x000x000x580x000x000x070x000x8e0x07,
    0x000xc80x0f0x000x250x760x040x000x000x5a0x000x00,
    0x070x000x8e0x070x000xe40x0f0x0c0x810x790x030x02,
    0x040x000x000x000x000x190x1e0x0c0x000xa80x0e0x00,
    0x810x790x040x040x040x000x000x000x000x190x1e0x0c,
    0x000xa20x0e0x000x250x760x060x000x000x5c0x000x00,
    0x070x000x8e0x070x000xe20x0f0x000x020x7a0x0b0x00,
    0x000x000x000x000x000x0f0x000x000x000xca0x0f0x00,
    0x240x7a0x000x0b0x000x030x000x000x000x020x8e0x07,
    0x000xca0x0f0x000x0c0x7a0x000x000x000x5e0x000x00,
    0x700x600xf00x030x000xe40x0f0x000x100x720x090x04,
    0x030x000x000x000xff0xe00xff0x070x000xca0x4f0x00,
    0x860x790x000x060x090x000x000x000x040x190x100x0c,
    0x000xec0x010x000x470x890x000x000x400xff0xff0xff,
    0xff0xff0x830x030x000xea0x0f0x000x4d0x790x000x00,
    0x000x000x000x000x000x000x800x030x000xea0x0f0x00,
    0x470x790x000x000xf00xff0xff0xff0xff0xff0x830x03,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x180x790x000x00,
    0x000x000x000x000x000x000x000x000x000xc00x0f0x00,
    0x180x790x000x000x000x000x000x000x000x000x000x00,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x180x790x000x00,
    0x000x000x000x000x000x000x000x000x000xc00x0f0x00,
    0x180x790x000x000x000x000x000x000x000x000x000x00,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x180x790x000x00,
    0x000x000x000x000x000x000x000x000x000xc00x0f0x00,
    0x180x790x000x000x000x000x000x000x000x000x000x00,
    0x000xc00x0f0x000x180x790x000x000x000x000x000x00,
    0x000x000x000x000x000xc00x0f0x000x180x790x000x00,
    0x000x000x000x000x000x000x000x000x000xc00x0f0x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x010x000x000x000x030x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x400x000x000x000x000x000x000x00,
    0xfe0x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x010x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x0b0x000x000x00,
    0x030x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x3e0x010x000x00,
    0x000x000x000x000x090x010x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x010x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x130x000x000x000x020x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x480x020x000x000x000x000x000x000xa80x000x000x00,
    0x000x000x000x000x020x000x000x000x060x000x000x00,
    0x080x000x000x000x000x000x000x000x180x000x000x00,
    0x000x000x000x000xa30x000x000x000x010x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000xf00x020x000x000x000x000x000x00,
    0x700x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x010x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x290x000x000x00,
    0x000x000x000x700x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x600x030x000x00,
    0x000x000x000x000x300x000x000x000x000x000x000x00,
    0x030x000x000x000x000x000x000x000x040x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x430x000x000x000x000x000x000x700x400x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x900x030x000x000x000x000x000x000x740x000x000x00,
    0x000x000x000x000x030x000x000x000x0b0x000x000x00,
    0x040x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000xd30x000x000x000x010x000x000x70,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x040x040x000x000x000x000x000x00,
    0x200x000x000x000x000x000x000x000x030x000x000x00,
    0x000x000x000x000x040x000x000x000x000x000x000x00,
    0x080x000x000x000x000x000x000x000xef0x000x000x00,
    0x0b0x000x000x700x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x280x040x000x00,
    0x000x000x000x000x100x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x080x000x000x00,
    0x000x000x000x000x080x000x000x000x000x000x000x00,
    0xb00x000x000x000x090x000x000x000x400x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x380x040x000x000x000x000x000x000x100x000x000x00,
    0x000x000x000x000x030x000x000x000x040x000x000x00,
    0x080x000x000x000x000x000x000x000x100x000x000x00,
    0x000x000x000x000x6d0x000x000x000x010x000x000x00,
    0x420x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x480x040x000x000x000x000x000x00,
    0x7c0x010x000x000x000x000x000x000x000x000x000x00,
    0x0b0x000x000x000x040x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x320x000x000x00,
    0x010x000x000x000x060x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x060x000x00,
    0x000x000x000x000x000x020x000x000x000x000x000x00,
    0x030x000x000x000x060x000x000x0e0x800x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x060x000x000x000x050x000x000x000x000x0b0x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000xa80x000x000x00,
    0x000x000x000x000xa80x000x000x000x000x000x000x00,
    0x080x000x000x000x000x000x000x000x010x000x000x00,
    0x050x000x000x000x480x040x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000xb80x030x000x000x000x000x000x00,
    0xb80x030x000x000x000x000x000x000x080x000x000x00,
    0x000x000x000x000x010x000x000x000x050x000x000x00,
    0x000x0b0x000x000x000x000x000x000x000x000x000x00,
    0x000x000x000x000x000x000x000x000x000x000x000x00,
    0xa80x000x000x000x000x000x000x000xa80x000x000x00,
    0x000x000x000x000x080x000x000x000x000x000x000x00};

最后,这些PTX、CUBIN和FATBIN字节码可以使用cuModuleLoadData函数加载到CUDA driver API中,类似于我们之前使用的cuModuleLoad函数。

// Build command
// g++ run_vector_add_sm86.cpp vector_add_ptx_cubin_fatbin_sm86.cpp
// -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -o
// run_vector_add_sm86

// Using CUDA driver API is sufficient.
#include <cuda.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "vector_add_ptx_cubin_fatbin_sm86.hpp"

#define CHECK_CUDA_DRIVER_ERROR(val)                                           \
    check_driver((val), #val, __FILE__, __LINE__)

void check_driver(CUresult err, char const* func, char const* file, int line)
{
    if (err != CUDA_SUCCESS)
    {
        std::cerr << "CUDA Driver Error at: " << file << ":" << line
                  << std::endl;
        charconst* err_string{nullptr};
        std::cerr << cuGetErrorString(err, &err_string) << std::endl;
        std::cerr << err_string << " " << func << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

int main(int argc, char** argv)
{
    // 用户可以指定是否使用PTX、CUBIN或FATBIN字节码
    if (argc != 2)
    {
        std::cerr << "Usage: " << argv[0] << " ptx|cubin|fatbin" << std::endl;
        return1;
    }

    // 声明CUDA driver API所需的各种句柄和对象
    CUdevice cuda_device{0};        // 使用CUDA设备0
    CUfunction vector_add_kernel{}; // CUDA kernel 函数句柄
    CUmodule cuda_module{};         // CUDA模块句柄
    CUcontext cuda_context{};       // CUDA上下文句柄
    CUstream cuda_stream{};         // CUDA流句柄

    // 初始化CUDA driver API,目前此标志必须为0
    unsignedintconst cuda_driver_init_flags{0};
    CHECK_CUDA_DRIVER_ERROR(cuInit(cuda_driver_init_flags));
    
    // 创建CUDA上下文,目前此标志必须为0
    unsignedintconst cuda_context_init_flags{0};
    CHECK_CUDA_DRIVER_ERROR(
        cuCtxCreate(&cuda_context, cuda_context_init_flags, cuda_device));
    
    // 创建非阻塞CUDA流
    CHECK_CUDA_DRIVER_ERROR(
        cuStreamCreate(&cuda_stream, CU_STREAM_NON_BLOCKING));

    // 根据命令行参数从不同格式的字节码创建CUDA模块
    if (std::string{argv[1]} == "ptx")
    {
        // 从PTX字节码加载模块
        CHECK_CUDA_DRIVER_ERROR(cuModuleLoadData(
            &cuda_module, static_cast<voidconst*>(vector_add_int_ptx)));
    }
    elseif (std::string{argv[1]} == "cubin")
    {
        // 从CUBIN字节码加载模块
        CHECK_CUDA_DRIVER_ERROR(cuModuleLoadData(
            &cuda_module, static_cast<voidconst*>(vector_add_int_cubin)));
    }
    elseif (std::string{argv[1]} == "fatbin")
    {
        // 从FATBIN字节码加载模块
        CHECK_CUDA_DRIVER_ERROR(cuModuleLoadData(
            &cuda_module, static_cast<voidconst*>(vector_add_int_fatbin)));
    }
    else
    {
        std::cerr << "Error: The byte type is not supported." << std::endl;
        return1;
    }

    // 从CUDA模块中获取名为"vector_add"的 kernel 函数
    CHECK_CUDA_DRIVER_ERROR(
        cuModuleGetFunction(&vector_add_kernel, cuda_module, "vector_add"));

    // 设置向量元素数量
    unsignedint num_elements{8192};

    // 在主机端创建输入和输出向量
    std::vector<inthost_vector_a(num_elements, 1);          // 输入向量A
    std::vector<inthost_vector_b(num_elements, 2);          // 输入向量B
    std::vector<inthost_vector_c(num_elements, -1);         // 输出向量C
    std::vector<inthost_vector_c_reference(num_elements, -2)// 参考结果向量
    
    // 初始化输入向量,设置为索引值
    for (size_t i{0}; i < num_elements; ++i)
    {
        host_vector_a.at(i) = i;
        host_vector_b.at(i) = i;
    }
    
    // 计算参考结果,用于验证 kernel 计算的正确性
    for (size_t i{0}; i < num_elements; ++i)
    {
        host_vector_c_reference.at(i) =
            host_vector_a.at(i) + host_vector_b.at(i);
    }

    // 在设备端分配内存
    CUdeviceptr device_vector_a{};
    CUdeviceptr device_vector_b{};
    CUdeviceptr device_vector_c{};
    CHECK_CUDA_DRIVER_ERROR(
        cuMemAlloc(&device_vector_a, num_elements * sizeof(int)));
    CHECK_CUDA_DRIVER_ERROR(
        cuMemAlloc(&device_vector_b, num_elements * sizeof(int)));
    CHECK_CUDA_DRIVER_ERROR(
        cuMemAlloc(&device_vector_c, num_elements * sizeof(int)));

    // 将输入向量从主机复制到设备
    CHECK_CUDA_DRIVER_ERROR(cuMemcpyHtoD(device_vector_a, host_vector_a.data(),
                                         num_elements * sizeof(int)));
    CHECK_CUDA_DRIVER_ERROR(cuMemcpyHtoD(device_vector_b, host_vector_b.data(),
                                         num_elements * sizeof(int)));

    // 设置 kernel 参数数组,包含设备内存指针和元素数量
    void* kernel_params[]{&device_vector_a, &device_vector_b, &device_vector_c,
                          &num_elements};
    
    // 配置 kernel 启动参数
    unsignedintconst block_size_x{256};  // 线程块大小:256个线程
    unsignedintconst block_size_y{1};
    unsignedintconst block_size_z{1};
    // 计算网格大小,确保覆盖所有元素
    unsignedintconst grid_size_x{(num_elements + block_size_x - 1) /
                                   block_size_x};
    unsignedintconst grid_size_y{1};
    unsignedintconst grid_size_z{1};
    unsignedintconst shared_memory_size{0}; // 不使用共享内存
    
    // 启动CUDA kernel 
    CHECK_CUDA_DRIVER_ERROR(cuLaunchKernel(
        vector_add_kernel, grid_size_x, grid_size_y, grid_size_z, block_size_x,
        block_size_y, block_size_z, shared_memory_size, cuda_stream,
        kernel_params, nullptr));
    
    // 等待 kernel 执行完成
    CHECK_CUDA_DRIVER_ERROR(cuStreamSynchronize(cuda_stream));

    // 将结果向量从设备复制回主机
    CHECK_CUDA_DRIVER_ERROR(cuMemcpyDtoH(host_vector_c.data(), device_vector_c,
                                         num_elements * sizeof(int)));

    // 验证计算结果的正确性
    for (size_t i{0}; i < num_elements; ++i)
    {
        if (host_vector_c.at(i) != host_vector_c_reference.at(i))
        {
            std::cerr << "Error: The result is incorrect." << std::endl;
            return1;
        }
    }

    // 清理资源:释放设备内存
    CHECK_CUDA_DRIVER_ERROR(cuMemFree(device_vector_a));
    CHECK_CUDA_DRIVER_ERROR(cuMemFree(device_vector_b));
    CHECK_CUDA_DRIVER_ERROR(cuMemFree(device_vector_c));

    // 销毁CUDA流
    CHECK_CUDA_DRIVER_ERROR(cuStreamDestroy(cuda_stream));

    // 卸载CUDA模块
    CHECK_CUDA_DRIVER_ERROR(cuModuleUnload(cuda_module));

    // 销毁CUDA上下文
    CHECK_CUDA_DRIVER_ERROR(cuCtxDestroy(cuda_context));

    return0;
}

要构建程序,请运行以下命令。

$ g++ run_vector_add_sm86.cpp vector_add_ptx_cubin_fatbin_sm86.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -o run_vector_add_sm86

要运行程序,请运行以下命令。应该不会遇到错误。

$ ./run_vector_add_sm86 ptx
$ ./run_vector_add_sm86 cubin
$ ./run_vector_add_sm86 fatbin

参考

  • CUDA Compilation(https://leimao.github.io/blog/CUDA-Compilation/)
  • Simple Driver Runtime – CUDA Examples(https://github.com/NVIDIA/cuda-samples/tree/9c688d7ff78455ed42e345124d1495aad6bf66de/Samples/0_Introduction/simpleDrvRuntime)



为了感谢读者的长期支持,今天我们将送出三本由 清华大学出版社 提供的:《scikit-learn机器学习超入门 算法原理与实践》。点击下方抽奖助手参与抽奖。没抽到的小伙伴可以使用下方链接购买。
《scikit-learn机器学习超入门 算法原理与实践》抽奖链接

(文:GiantPandaCV)

发表评论