移动OpenCL本地存储库冲突。为什么使用本地内存比内核中的全局内存要慢？

debugcn 发表于 Dev

李英万

我正在使用开发Android平台中的人脸检测应用程序OpenCL。人脸检测算法基于Viola Jones算法。我试图制作Cascade分类步骤内核代码。我将classifier data级联阶段1设置为级联阶段，local memory(__local)因为分类器数据用于所有工作项。

但是，不使用本地内存（使用全局内存）的内核分析时间要比使用本地内存的速度更快。

编辑：

我上传了完整的代码。

带有本地内存版本

__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
       int cascadeLocalSize = get_local_size(0);

       __local float localS1F1[42];

       int localIdx = get_local_id(1)*cascadeLocalSize + get_local_id(0);
       if(localIdx<42)
       {
           int stage1Idx = localIdx + idxNumValStageArray[0]+4;
           localS1F1[localIdx] = classifierMem[stage1Idx];
       }
       barrier(CLK_LOCAL_MEM_FENCE);


       float resizeFactor = 1.0;
       int2 im_dim = get_image_dim(input_image);
       unsigned int srcWidth = im_dim.x*(float)resizeFactor;
       unsigned int srcHeight = im_dim.y*(float)resizeFactor;

       int gx = get_global_id(0);
       int gy = get_global_id(1);

       int skinX=0;
       int skinY=0;
       int coordi=vecSkin[512*gy+gx];
       skinX = coordi%im_dim.x;
       skinY = coordi/im_dim.x;

       if( skinX >= 10 && skinY >= 10 )
       {
             skinX -= 10;
             skinY -= 10;
       }      

       int type = gx%3;

       unsigned int windowWidth = classifierMem[0];
       unsigned int windowHeight = classifierMem[1]; 


       unsigned int stageIndex;
       float stageThres;
       float numFeatures;
       unsigned int featureIndex;
       float featureValue;

       if(skinX<srcWidth-windowWidth-1 && skinY<srcHeight-windowHeight-1){
             bool stagePass = true;
             unsigned int index = 0;
             for(unsigned int i=numTotStage; i>0;i--){
                    if(stagePass){
                           if(index == 0){
                                 stageIndex = idxNumValStageArray[0];                                 
                                 stageThres = classifierMem[stageIndex+2];
                                 numFeatures = classifierMem[stageIndex+3];
                                 featureIndex = 0;
                                 featureValue = 0.0;                           
                           }
                           else{
                                 stageIndex = idxNumValStageArray[index];
                                 stageThres = classifierMem[stageIndex+2];
                                 numFeatures = classifierMem[stageIndex+3];
                                 featureIndex = stageIndex+4;
                                 featureValue = 0.0;
                           }
                           float featureThres;
                           float succVal;
                           float failVal;
                           unsigned int numRegions;
                           float regionValue;


                           if(type ==0 && index==0)
                           {
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;

                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;

                                        }// end of if(stagePass) 
                                 }// end of for(unsigned int j=numFeatures; j>0;j--)

                                  index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));

                           }

                           else if(type ==1 && index ==0)
                           {
                                 featureIndex +=14;
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                              if(j==1)
                                                     featureIndex -= 42;

                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;


                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                                        }
                                 }

                                  index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }

                           else if(index == 0)
                           {
                                 featureIndex +=28;
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){

                                              if(j==2)     featureIndex -= 42;

                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;

                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }// end of for(unsigned int k=numRegions; k>0;k--)
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;

                                        }// end of if(stagePass)
                                 }//end of for(unsigned int j=numFeatures; j>0;j--)

                                 index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }

                           //stage 
                           else{
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                               featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=classifierMem[featureIndex++];
                                              failVal=classifierMem[featureIndex++];
                                              numRegions = classifierMem[featureIndex++];
                                              regionValue =0.0;
                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){
                                                     regionP.x=(int)(classifierMem[featureIndex])+skinX;
                                                     regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
                                                     regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4]; 
                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                                        }
                                 }
                                 index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }
                    }
             }      
       }else return;
}

原始版本（无本地内存）

__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
    float resizeFactor = 1.0;

    int2 im_dim = get_image_dim(input_image);

    unsigned int srcWidth = im_dim.x*(float)resizeFactor;
    unsigned int srcHeight = im_dim.y*(float)resizeFactor;

    int gx = get_global_id(0);
    int gy = get_global_id(1);


    int skinX=0;
    int skinY=0;
    int coordi=vecSkin[512*gy+gx];
    skinX = coordi%im_dim.x;
    skinY = coordi/im_dim.x;

        if( skinX >= 10 && skinY >= 10 )
    {
        skinX -= 10;
        skinY -= 10;
    }   

    unsigned int windowWidth = classifierMem[0];
    unsigned int windowHeight = classifierMem[1];   

    if(gx<srcWidth-windowWidth-1 && gy<srcHeight-windowHeight-1){
        bool stagePass = true;
        unsigned int index = 0;
        for(unsigned int i=numTotStage; i>0;i--){
            if(stagePass){
                unsigned int stageIndex = idxNumValStageArray[index++];
                float stageThres = classifierMem[stageIndex+2];
                float numFeatures = classifierMem[stageIndex+3];
                unsigned int featureIndex = stageIndex+4;
                float featureValue = 0.0;               

                for(unsigned int j=numFeatures; j>0;j--){
                    if(stagePass){
                        float featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
                        float succVal=classifierMem[featureIndex++];
                        float failVal=classifierMem[featureIndex++];
                        unsigned int numRegions = classifierMem[featureIndex++];
                        float regionValue =0.0;

                        for(unsigned int k=numRegions; k>0;k--){                    
                            float4 rectValue;
                            int4 regionP;

                            regionP.x=(int)(classifierMem[featureIndex])+skinX;
                            regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
                            regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
                            regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;

                            rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                            rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                            rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                            rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                            regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];

                            featureIndex+=5;
                        }

                        featureValue += (regionValue < featureThres)?failVal:succVal;                   

                        if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                    }
                }
                if(featureValue < stageThres)   stagePass =false;
                else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
            }
        }   
    }else return;
}

分析时间：原始版本（无本地内存）：24ms修改版本（有本地mem）：28ms

编辑：实际上localWorkSize为NULL，因为globalWorkSize总是随放置NDRangeKernel的向量大小而变化。当放置特定的localWorkSize时，人脸检测率下降...所以我尝试将localWorkSize放到NUll，然后人脸检测率很好。所以我想知道原因。

这是主机代码：

    //localWorkSize[0] = 16;
    //localWorkSize[1] = 12; 
    numThreadsX=512;
    globalWorkSize[0] = numThreadsX;
    globalWorkSize[1] =  vecCoordinate.size()% numThreadsX == 0 ? vecCoordinate.size()/ numThreadsX :(vecCoordinate.size()/ numThreadsX) + 1;
    errNum = clEnqueueWriteBuffer(commandQueue,classifierMem,CL_TRUE,0,sizeof(float)*cntValArray,stageValArray,0,NULL,NULL); 
    errNum |= clEnqueueWriteBuffer(commandQueue,idxStageMem,CL_TRUE,0,sizeof(int)*haar.numStages,idxNumValStageArray,0,NULL,NULL); 
    errNum |= clSetKernelArg(kHaar_Cascade, 0, sizeof(memObjBuffer_Haar22), &memObjBuffer_Haar22);
    errNum |= clSetKernelArg(kHaar_Cascade, 1, sizeof(memObjBuffer22), &memObjBuffer22);
    errNum |= clSetKernelArg(kHaar_Cascade, 2, sizeof(cl_mem), &classifierMem);
    errNum |= clSetKernelArg(kHaar_Cascade, 3, sizeof(cl_mem), &idxStageMem);
    errNum |= clSetKernelArg(kHaar_Cascade, 4, sizeof(cl_int), &haar.numStages);
    errNum |= clSetKernelArg(kHaar_Cascade, 5, sizeof(cl_mem), &memVecCoordi);

    errNum = clEnqueueNDRangeKernel(commandQueue, kHaar_Cascade, 2, NULL,globalWorkSize, NULL,0, NULL, &event[3]);

杰森·牛顿

原因多种多样：

内存负载的线程差异
障碍不是免费的
更多指示
在大多数平台上，恒定内存与共享内存几乎相同，因此额外的工作没有收获。
我们不知道保存了多少次读取共享内存版本-请告诉我们工作组的大小。它越小，从我们保存的非本地内存中读取的内容越少。这是否是恒定内存并不重要。

剖析这两个参数，然后查看探查器的输出是否以最大利用率来显示任何东西，并向我们表明那是什么。

另外，我也不相信您的localIdx和stage1Idx有意义，它们可能超出数组范围并导致异常行为。至少对于给定的gx / gy，您看起来好像正在使用来自classifierMem的不同索引。

本文收集自互联网，转载请注明来源。

如有侵权，请联系[email protected] 删除。

编辑于2021-06-11

我来说两句

0条评论

登录后参与评论

来自分类Dev

Related 相关文章

文章

移动OpenCL本地存储库冲突。为什么使用本地内存比内核中的全局内存要慢？

移动OpenCL本地存储库冲突。为什么使用本地内存比内核中的全局内存要慢？

opencl内核中的动态全局内存分配

OpenCL全局内存

Opencl - 将全局内存工作组 + 边界转移到本地内存

OpenCL中本地内存的优势是什么？

如何从全局内存到本地内存进行跨步复制？

OpenCL同步全局内存

将表单本地内存写入全局时是否需要本地和全局内存屏障

CUDA内核中全局内存负载事务的计算

有什么方法可以让内核根据数据大小使用常量或全局内存

OpenCL中的本地内存有什么优势？

使用向量将OpenCL中的全局内存读取最小化？

使用向量将OpenCL中的全局内存读取最小化？

为什么在此openCL内核代码中，localSum是本地的？

CUDA中的全局内存与动态全局内存分配

使用OpenCL优化本地内存使用

在OpenCL中使用本地内存并行减少

OpenCL中工作项和全局内存之间的内存传输？

OpenCL本地内存和Xcode

我们为什么要本地化全局库/引用？

将选择性元素存储在GPU全局内存中

CUDA将数据从全局内存缓存到统一缓存中，以将其存储到共享内存中吗？

opencl本地内存大小如何工作？

为OpenCL内核分配的额外32字节本地内存

线程与全局内存的交互

OpenCL：如何将全局内存的连续块复制到私有内存？

尝试将动态本地内存传递给内核时遇到访问冲突

是否可以使用JGit的内存中存储库将一个小文件添加到大的Git存储库而无需在本地检出它？

移动本地SVN存储库

为什么要备份本地或地理冗余的azure存储帐户？