Opencl减少不符合预期

苏拉利奥

我是有关opencl的新手。我已经尝试过“获取数组中每个元素的所有多维数据集的总和”。这是我的内核代码:

kernel void cubeSum(global float *input,
                    local float *prods,
                    global float *output )
{
    int gid = get_global_id( 0 );
    int tnum = get_local_id( 0 ); // thread number
    int wgNum = get_group_id( 0 ); // work-group number
    int numItems = get_local_size( 0 );
    prods[ tnum ] = input[ gid ] * input[ gid ] * input[gid]; // cube

    for (int offset = 1; offset < numItems; offset *= 2) {
        int mask = 2 * offset - 1;
        barrier(CLK_LOCAL_MEM_FENCE);
        if ( (tnum & mask) == 0 ) {
            prods[tnum] += prods[tnum + offset];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if ( tnum == 0 )
        output[wgNum] = prods[0];
}

我无法弄清楚为什么我的结果与顺序结果不一样。当数组从0到511时,我的结果是顺序结果减去2048;当数组从0到1023时,我的结果是顺序结果加16384。

在等待您回答时,我会尝试自己解决问题。

另一个问题是我发现很难调试内核代码,因为数据集很大并且可以同时运行。有调试建议吗?

感谢所有建议=)。

顺便说一下,这是我的主机代码:

#include <stdio.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>

#define NUM_ELEMENTS (512)
#define LOCAL_SIZE (512)
#define MAX_SOURCE_SIZE (0x100000)

int main(int argc, const char * argv[])
{
    float data[NUM_ELEMENTS];           //hA
    float sum;
    float sumTest;

    size_t global;
    size_t local;
    size_t numWorkGroups;
    size_t dataSize;
    size_t resultsSize;

    cl_device_id device;
    cl_context context;
    cl_command_queue cmdQueue;
    cl_program program;
    cl_kernel kernel;

    cl_mem input;
    cl_mem output;

    FILE *fp;
    //failed to use relative path here. permission problem?
    char fileName[] = "/Users/sure/USC/590/cubeSum/cubeSum/cubeSum.cl";
    char *source_str;
    size_t source_size;

    /* カーネルを含むソースコードをロード */
    fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    //allocate the host memory buffers:
    int i = 0;
    unsigned int count = NUM_ELEMENTS;
    for (i = 0; i < count; i++) {
        data[i] = i;
    }

    //array size in bytes (will need this later):
    dataSize = NUM_ELEMENTS * sizeof(float);

    //opencl function status
    cl_int status;

    // Connect to a compute device
    //
    int gpu = 1;

    status = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (status != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    //create an Opencl context
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);

    //create a command queue
    cmdQueue = clCreateCommandQueue( context, device, 0, &status );

    //allocate memory buffers on the device
    input = clCreateBuffer( context, CL_MEM_READ_ONLY, dataSize, NULL, &status );   //dA

    //TODO: at this line, I don't have the value of local which is calculated by clGetKernelWorkGroupInfo
    //need to figure out a way to avoid hardcode it.
    output = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(float) * NUM_ELEMENTS / LOCAL_SIZE, NULL, &status ); //dC

    // enqueue the 2 commands to write data into the device buffers:
    status = clEnqueueWriteBuffer( cmdQueue, input, CL_FALSE, 0, dataSize, data, 0, NULL, NULL );

    // create the kernel program on the device:
    program = clCreateProgramWithSource(context, 1, (const char **) & source_str, (const size_t *)&source_size, &status);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }


    // Build the program executable
    //
    status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    //create compute kernel
    kernel = clCreateKernel( program, "cubeSum", &status );

    // Get the maximum work group size for executing the kernel on the device
    //
    status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (status != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", status);
        exit(1);
    }

    global = count;

    numWorkGroups = global / local;
    float results[numWorkGroups];        //hC
    resultsSize = numWorkGroups * sizeof(float);

    //set kernel parameter
    status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &input );
    status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
    status = clSetKernelArg( kernel, 2, sizeof(cl_mem), &output );

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (status)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    clFinish(cmdQueue);
    status = clEnqueueReadBuffer( cmdQueue, output, CL_TRUE, 0, resultsSize, results, 0, NULL, NULL );

    // Validate our results
    //
    sum = 0;

    for (int i=0; i<numWorkGroups; i++) {
        sum += results[i];
    }

    sumTest = 0;
    for(i = 0; i < count; i++)
    {
        sumTest += data[i] * data[i] * data[i];
    }

    // Print a brief summary detailing the results
    //
    printf("Computed '%f/%f'!\n", sum, sumTest);

    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(cmdQueue);
    clReleaseContext(context);

    return 0;

}

编辑:刚发现另一件事。如果我仅对所有不带立方体/正方形的元素求和,我的代码是正确的。因此,我要弄清楚多维数据集如何影响我的程序。

价格

您似乎只分配了4个字节的本地内存:

status = clSetKernelArg( kernel, 1, sizeof(float), NULL );

这应该是整个工作组对该参数所需的本地内存总量。对于您的内核,这是(work-group-size * sizeof(float))

因此,您应该改成这样:

status = clSetKernelArg( kernel, 1, local*sizeof(float), NULL );

您看到的差异可能是由于浮点数的限制,因为您将一些非常大的数字相加。如果使用较小的数字(例如data[i] = i*0.01;初始化输入,则应获得与顺序实现相同的结果(我已经在自己的系统上进行了验证)。这就是为什么在删除多维数据集时看不到错误的原因。

本文收集自互联网,转载请注明来源。

如有侵权,请联系[email protected] 删除。

编辑于
0

我来说两句

0条评论
登录后参与评论

相关文章