Pass by reference in CUDA device functions

道成

I want to pass by reference in CUDA, inside a MATLAB MEX file. I call add_func, which returns multiple values; I pass them via pointers, but the code does not behave correctly when I run it. Please take a look at my code; any suggestions are appreciated. Thanks in advance.

#include <stdio.h>
#include "mex.h"
#include "matrix.h"
#include "gpu/mxGPUArray.h"

typedef void(*op_func_t) (double, double, double*, double*);
typedef void(*my_func_t) (double, double, double, double, void (*func)(double, double, double *, double *));

__device__ void add_func(double x, double y, double *z, double *k)

{
    *z= x + y;
    *k= x + y;
}
__device__ void mul_func(double x, double y, double *z, double *k)

{   
    *z= x * y;
    *k= x * y;

}
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
    (*func)(x, y, &z, &k);
}


// Static pointers to device functions

__device__ op_func_t p_add_func = add_func;

__device__ op_func_t p_mul_func = mul_func;

__device__ my_func_t p_my_func = my_func;


__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)

{
    int const i = blockDim.x * blockIdx.x + threadIdx.x;

        if (i<5)
        {
            (*op2)(x[i], y[i], u[i], v[i], op1);
        }
        else   
        {
            v[i]=10;
            u[i]=8;
        }
    __syncthreads(); // wait for each thread to copy its element
}

//host code

void mexFunction(int nlhs,mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

    /* Declare all variables.*/
    mxGPUArray const *A;
    mxGPUArray const *C;
    mxGPUArray *B;
    mxGPUArray *D;
    double const *d_A;
    double const *d_C;
    double *d_B;
    double *d_D;
    int N;

    /* Choose a reasonably sized number of threads for the block. */
    int const threadsPerBlock = 256;
    int blocksPerGrid;

    /* Initialize the MathWorks GPU API. */
    mxInitGPU();

    A = mxGPUCreateFromMxArray(prhs[0]);
    C = mxGPUCreateFromMxArray(prhs[1]);

    /*
     * Now that we have verified the data type, extract a pointer to the input
     * data on the device.
     */
    d_A = (double const *)(mxGPUGetDataReadOnly(A));
    d_C = (double const *)(mxGPUGetDataReadOnly(C));

    /* Create a GPUArray to hold the result and get its underlying pointer. */
    B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
                            mxGPUGetDimensions(A),
                            mxGPUGetClassID(A),
                            mxGPUGetComplexity(A),
                            MX_GPU_DO_NOT_INITIALIZE);
    D = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
                            mxGPUGetDimensions(A),
                            mxGPUGetClassID(A),
                            mxGPUGetComplexity(A),
                            MX_GPU_DO_NOT_INITIALIZE);
    d_B = (double *)(mxGPUGetData(B));
    d_D = (double *)(mxGPUGetData(D));

    /*
     * Call the kernel using the CUDA runtime API. We are using a 1-d grid here,
     * and it would be possible for the number of elements to be too large for
     * the grid. For this example we are not guarding against this possibility.
     */
    N = (int)(mxGPUGetNumberOfElements(A));
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    op_func_t h_add_func;

    op_func_t h_mul_func;

    my_func_t h_my_func;


    // Copy device function pointer to host side
    cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));

    cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));

    cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));

    op_func_t d_myfunc = h_mul_func;
    op_func_t d_myfunc1 = h_add_func;
    my_func_t d_myfunc2 = h_my_func; 

    kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);

    /* Wrap the result up as a MATLAB gpuArray for return. */
    plhs[0] = mxGPUCreateMxArrayOnGPU(B);
    plhs[1] = mxGPUCreateMxArrayOnGPU(D);
    /*
     * The mxGPUArray pointers are host-side structures that refer to device
     * data. These must be destroyed before leaving the MEX function.
     */
    mxGPUDestroyGPUArray(A);
    mxGPUDestroyGPUArray(B);
    mxGPUDestroyGPUArray(C);
    mxGPUDestroyGPUArray(D);

    return;

}

These are my results:

[a,b]=return_values(gpuArray(ones(10)),gpuArray(rand(10)))

a =

 0    10    10    10    10    10    10    10    10    10
 0    10    10    10    10    10    10    10    10    10
 0    10    10    10    10    10    10    10    10    10
 0    10    10    10    10    10    10    10    10    10
 0    10    10    10    10    10    10    10    10    10
10    10    10    10    10    10    10    10    10    10
10    10    10    10    10    10    10    10    10    10
10    10    10    10    10    10    10    10    10    10
10    10    10    10    10    10    10    10    10    10
10    10    10    10    10    10    10    10    10    10

b =

 0     8     8     8     8     8     8     8     8     8
 0     8     8     8     8     8     8     8     8     8
 0     8     8     8     8     8     8     8     8     8
 0     8     8     8     8     8     8     8     8     8
 0     8     8     8     8     8     8     8     8     8
 8     8     8     8     8     8     8     8     8     8
 8     8     8     8     8     8     8     8     8     8
 8     8     8     8     8     8     8     8     8     8
 8     8     8     8     8     8     8     8     8     8
 8     8     8     8     8     8     8     8     8     8

Some initialization code:

// Static pointers to device functions

__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;

op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;

// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));

op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
看门人

There are a number of problems here. This:

__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
    (*func)(x, y, &z, &k);
}

is broken. Because of pass-by-value, the modified values of z and k are never returned to the caller. You can fix this by using references (CUDA is based on C++, and references are supported in __device__ functions):

typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));

__device__ void add_func(double x, double y, double& z, double& k)

{
    z= x + y;
    k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)

{   
    z= x * y;
    k= x * y;

}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
    (*func)(x, y, z, k);
}
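
An alternative sketch, if you would rather keep the pointer-based signatures: just forward the pointers instead of taking z and k by value, so the forwarding function never works on local copies (my_func_ptr below is only an illustrative name, not part of your code):

typedef void (*op_func_t)(double, double, double *, double *);

__device__ void my_func_ptr(double x, double y, double *z, double *k,
                            op_func_t func)
{
    func(x, y, z, k);   // z and k still point at the caller's storage
}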

The second problem is the missing bounds check in the kernel, which causes out-of-bounds memory accesses whenever N is not an exact multiple of threadsPerBlock.
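
The usual guard, sketched here on a minimal element-wise kernel (illustrative only, not your kernel), is to have every thread test its index against N before touching memory:

__global__ void guarded_copy(double const *in, double *out, int const N)
{
    int const i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)              // threads past the end of the data do nothing
        out[i] = in[i];
}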

When I fix these two things in your code like this:

#include <thrust/device_vector.h>
#include <iostream>

typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));

__device__ void add_func(double x, double y, double& z, double& k)

{
    z= x + y;
    k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)

{   
    z= x * y;
    k= x * y;

}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
    (*func)(x, y, z, k);
}


// Static pointers to device functions

__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;

__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
    int const i = blockDim.x * blockIdx.x + threadIdx.x;

        if (i<5) {
            (*op2)(x[i], y[i], u[i], v[i], op1);
        } else if (i<N)   {
            v[i]=10;
            u[i]=8;
        }
}

//host code

int main()
{
    const size_t n = 5;
    const size_t N = n * n;

    /* Declare all variables.*/
    thrust::device_vector<double> A(N, 1.0);
    thrust::device_vector<double> C(N, 1.0);
    thrust::device_vector<double> B(N);
    thrust::device_vector<double> D(N);
    double *d_A = thrust::raw_pointer_cast(A.data()); 
    double *d_C = thrust::raw_pointer_cast(C.data());  
    double *d_B = thrust::raw_pointer_cast(B.data()); 
    double *d_D = thrust::raw_pointer_cast(D.data()); 


    /* Choose a reasonably sized number of threads for the block. */
    int const threadsPerBlock = 256;
    int blocksPerGrid;

    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    op_func_t h_add_func;
    op_func_t h_mul_func;
    my_func_t h_my_func;

    // Copy device function pointer to host side
    cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
    cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
    cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));

    op_func_t d_myfunc = h_mul_func;
    op_func_t d_myfunc1 = h_add_func;
    my_func_t d_myfunc2 = h_my_func; 

    kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
    cudaDeviceSynchronize();

    for(const auto bval: B)
        std::cout << bval << std::endl;

    for(const auto dval: D)
        std::cout << dval << std::endl;

    return 0;
}

As far as I can tell, the CUDA part of your code is working correctly:

$ nvcc -o mexhead -std=c++11 -arch=sm_52 mexhead.cu
$ cuda-memcheck ./mexhead 
========= CUDA-MEMCHECK
2
2
2
2
2
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
2
2
2
2
2
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
========= ERROR SUMMARY: 0 errors
