我正在使用开发Android平台中的人脸检测应用程序OpenCL
。人脸检测算法基于Viola Jones算法。我试图制作Cascade分类步骤内核代码。我将classifier data
级联阶段1设置为级联阶段,local memory(__local)
因为分类器数据用于所有工作项。
但是,不使用本地内存(使用全局内存)的内核分析时间要比使用本地内存的速度更快。
编辑:
我上传了完整的代码。
带有本地内存版本
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
int cascadeLocalSize = get_local_size(0);
__local float localS1F1[42];
int localIdx = get_local_id(1)*cascadeLocalSize + get_local_id(0);
if(localIdx<42)
{
int stage1Idx = localIdx + idxNumValStageArray[0]+4;
localS1F1[localIdx] = classifierMem[stage1Idx];
}
barrier(CLK_LOCAL_MEM_FENCE);
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
int type = gx%3;
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
unsigned int stageIndex;
float stageThres;
float numFeatures;
unsigned int featureIndex;
float featureValue;
if(skinX<srcWidth-windowWidth-1 && skinY<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
if(index == 0){
stageIndex = idxNumValStageArray[0];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = 0;
featureValue = 0.0;
}
else{
stageIndex = idxNumValStageArray[index];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = stageIndex+4;
featureValue = 0.0;
}
float featureThres;
float succVal;
float failVal;
unsigned int numRegions;
float regionValue;
if(type ==0 && index==0)
{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}// end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(type ==1 && index ==0)
{
featureIndex +=14;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==1)
featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(index == 0)
{
featureIndex +=28;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==2) featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}// end of for(unsigned int k=numRegions; k>0;k--)
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}//end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
//stage
else{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
succVal=classifierMem[featureIndex++];
failVal=classifierMem[featureIndex++];
numRegions = classifierMem[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}
}else return;
}
原始版本(无本地内存)
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
if(gx<srcWidth-windowWidth-1 && gy<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
unsigned int stageIndex = idxNumValStageArray[index++];
float stageThres = classifierMem[stageIndex+2];
float numFeatures = classifierMem[stageIndex+3];
unsigned int featureIndex = stageIndex+4;
float featureValue = 0.0;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
float featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
float succVal=classifierMem[featureIndex++];
float failVal=classifierMem[featureIndex++];
unsigned int numRegions = classifierMem[featureIndex++];
float regionValue =0.0;
for(unsigned int k=numRegions; k>0;k--){
float4 rectValue;
int4 regionP;
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}else return;
}
分析时间:原始版本(无本地内存):24ms修改版本(有本地mem):28ms
编辑:实际上localWorkSize为NULL,因为globalWorkSize总是随放置NDRangeKernel的向量大小而变化。当放置特定的localWorkSize时,人脸检测率下降...所以我尝试将localWorkSize放到NUll,然后人脸检测率很好。所以我想知道原因。
这是主机代码:
//localWorkSize[0] = 16;
//localWorkSize[1] = 12;
numThreadsX=512;
globalWorkSize[0] = numThreadsX;
globalWorkSize[1] = vecCoordinate.size()% numThreadsX == 0 ? vecCoordinate.size()/ numThreadsX :(vecCoordinate.size()/ numThreadsX) + 1;
errNum = clEnqueueWriteBuffer(commandQueue,classifierMem,CL_TRUE,0,sizeof(float)*cntValArray,stageValArray,0,NULL,NULL);
errNum |= clEnqueueWriteBuffer(commandQueue,idxStageMem,CL_TRUE,0,sizeof(int)*haar.numStages,idxNumValStageArray,0,NULL,NULL);
errNum |= clSetKernelArg(kHaar_Cascade, 0, sizeof(memObjBuffer_Haar22), &memObjBuffer_Haar22);
errNum |= clSetKernelArg(kHaar_Cascade, 1, sizeof(memObjBuffer22), &memObjBuffer22);
errNum |= clSetKernelArg(kHaar_Cascade, 2, sizeof(cl_mem), &classifierMem);
errNum |= clSetKernelArg(kHaar_Cascade, 3, sizeof(cl_mem), &idxStageMem);
errNum |= clSetKernelArg(kHaar_Cascade, 4, sizeof(cl_int), &haar.numStages);
errNum |= clSetKernelArg(kHaar_Cascade, 5, sizeof(cl_mem), &memVecCoordi);
errNum = clEnqueueNDRangeKernel(commandQueue, kHaar_Cascade, 2, NULL,globalWorkSize, NULL,0, NULL, &event[3]);
原因多种多样:
剖析这两个参数,然后查看探查器的输出是否以最大利用率来显示任何东西,并向我们表明那是什么。
另外,我也不相信您的localIdx和stage1Idx有意义,它们可能超出数组范围并导致异常行为。至少对于给定的gx / gy,您看起来好像正在使用来自classifierMem的不同索引。
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句