GPU上的矩阵乘法。内存库冲突和延迟隐藏

debugcn 发表于 Dev

侯赛因·图格鲁·布基基西克

编辑：随着时间的推移成就已列在该问题的末尾（〜1Tflops / s尚未）。

我使用C ++ DLL中的opencl（gpu）为C＃编写了某种数学库，并且已经对单精度方阵矩阵乘法进行了一些优化（出于学习目的和以后在神经网络程序中重新使用的可能性）。下面的内核代码将v1 1D数组作为matrix1（1024x1024）的行将v2 1D数组作为matrix2（（1024x1024）转置优化）的列并将结果放入v3 1D数组作为matrix-3的行。（1024x1024）

目前，HD7870的1024x1024方阵矩阵乘法的内核执行时间为3.6 ms。

已完成优化：

第二矩阵的换位（改善时间）
使用32x32子矩阵在本地内存中进行计算（4x 16x16，因为在我的HD7870上最大工作组大小为256，并且gpu由于某些原因不接受超过24kB的本地内存，但在线消息来源说64kB？）（无论如何，可以节省很多时间）
在将结果写入本地和全局之前，增加了使用私有变量重复使用数据的时间（缩短了时间）
列主要访问最内层循环中的本地2D数组。（延长时间）
每个补丁将加法共享到两个累加器寄存器中。（改善了时间，降低了数值稳定性）
循环展开最里面的循环并没有改善时间（第四次展开后甚至变得更糟）（因此必须放松整数alu）

问题：我无法完成一些优化，例如消除所有本地（lds）库冲突和指令重新排序以隐藏内存延迟。我该怎么做才能改善此数学函数的性能？

此内核肯定是受本地内存带宽（冲突）限制的，具有3.2 ms的乘法=

（1024 * 1024 * 1024 *（1和+1倍= 2）/ 0.0036秒）= 596x10 ^ 9每秒翻转（596 GFlops）我在GTX680上看到一些CUDA的在线基准测试，它们已经打破了1TFlops点。因为每个计算单元有更多本地内存，还是有更多内核，或者两者兼而有之？

（1024 * 1024 * 1024 *（2个浮点读取）*（每个浮点4个字节）/0.0036秒）= 2386x10 ^ 9字节/秒但是此内核读取8个浮点并将它们使用16次，数据重用为2每个浮点数。

2386x10 ^ 9字节/重用（2）= 1193 GB / s

HD7870的理论最大值为：此处，附录D

计算能力= 2560 Giga浮点运算/秒，LDS带宽= 2560 GB / s，寄存器访问带宽= 15360 GB / s

这是内核：

__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3) 
{
    int localRow = get_local_id(0); 
    int localCol = get_local_id(1);  
    int selectRowFromA = get_group_id(0)*32;     
    int selectColFromB = get_group_id(1)*32;     
    int lid= localCol*16+localRow; 
    __local float Lcache1[ 16][ 16]; 
    __local float Lcache2[ 16][ 16]; 
    __local float Lcache3[ 16][ 16]; 

    __local float Lcache1a[ 16][ 16]; 
    __local float Lcache2a[ 16][ 16]; 
    __local float Lcache3a[ 16][ 16]; 

    __local float Lcache1b[ 16][ 16]; 
    __local float Lcache2b[ 16][ 16]; 
    __local float Lcache3b[ 16][ 16]; 

    __local float Lcache1c[ 16][ 16]; 
    __local float Lcache2c[ 16][ 16]; 
    __local float Lcache3c[ 16][ 16]; 

    float tmp0=0.0f; 
    float tmp1=0.0f; 
    float tmp2=0.0f; 
    float tmp3=0.0f; 

    float tmp4=0.0f; 
    float tmp5=0.0f; 
    float tmp6=0.0f; 
    float tmp7=0.0f; 

    float sumPatch=0.0f; 
    float sumPatcha=0.0f; 
    float sumPatchb=0.0f; 
    float sumPatchc=0.0f; 
    float sumPatch2=0.0f; 
    float sumPatcha2=0.0f; 
    float sumPatchb2=0.0f; 
    float sumPatchc2=0.0f; 

    barrier(CLK_LOCAL_MEM_FENCE); 
    Lcache3[localRow][localCol]=0.0f; 
    Lcache3a[localRow][localCol]=0.0f; 
    Lcache3b[localRow][localCol]=0.0f; 
    Lcache3c[localRow][localCol]=0.0f; 
    barrier(CLK_LOCAL_MEM_FENCE); 
    for(int i=0;i<1024;i+=32)  // this is A's row and B's column parsed by sub-matrices
    { 
        barrier(CLK_LOCAL_MEM_FENCE); 
        Lcache1[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024];
        Lcache2[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024];
        Lcache1a[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16];
        Lcache2a[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16];
        Lcache1b[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16384];
        Lcache2b[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16384];
        Lcache1c[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16+16384];
        Lcache2c[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16+16384];
        barrier(CLK_LOCAL_MEM_FENCE); 
        sumPatch=0.0f; 
        sumPatcha=0.0f; 
        sumPatchb=0.0f; 
        sumPatchc=0.0f; 
        sumPatch2=0.0f; 
        sumPatcha2=0.0f; 
        sumPatchb2=0.0f; 
        sumPatchc2=0.0f; 
        for(int kk=0;kk< 16;kk++) //this is sub-matrix multiplication
        {   
            read_mem_fence(CLK_LOCAL_MEM_FENCE); 
            tmp0=Lcache1[kk][localRow];  // row-major
            tmp1=Lcache1a[kk][localRow]; // accesses
            tmp2=Lcache1b[kk][localRow]; //to local memory
            tmp3=Lcache1c[kk][localRow]; 
            tmp4=Lcache2[kk][localCol]; 
            tmp5=Lcache2a[kk][localCol]; 
            tmp6=Lcache2b[kk][localCol]; 
            tmp7=Lcache2c[kk][localCol]; 
            read_mem_fence(CLK_LOCAL_MEM_FENCE); 
            sumPatch+=tmp0*tmp4; 
            sumPatcha+=tmp0*tmp6; 
            sumPatchb+=tmp2*tmp4; 
            sumPatchc+=tmp2*tmp6; 
            sumPatch2+=tmp1*tmp5; 
            sumPatcha2+=tmp1*tmp7; 
            sumPatchb2+=tmp3*tmp5; 
            sumPatchc2+=tmp3*tmp7; 
        } 
        Lcache3[localRow][localCol]+=sumPatch+sumPatch2; 
        Lcache3a[localRow][localCol]+=sumPatcha+sumPatcha2; 
        Lcache3b[localRow][localCol]+=sumPatchb+sumPatchb2; 
        Lcache3c[localRow][localCol]+=sumPatchc+sumPatchc2; 
    } 
    barrier(CLK_LOCAL_MEM_FENCE); 
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024]=Lcache3[localRow][localCol];                   
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16]=Lcache3a[localRow][localCol];              
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16384]=Lcache3b[localRow][localCol];     
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16+16384]=Lcache3c[localRow][localCol];     
    barrier(CLK_LOCAL_MEM_FENCE); 
}

这是我试图消除存储区冲突的方法，但是内核执行时间增加了大约％20：

for(int kk=0;kk< 16;kk++) 
{   
    int nc=(kk+lid)&15;//different for all local threads
                       //but does not exceed 0-15 range
                       //summation order is not important
                       //0.+1.+...15. or 14.+15.+0.+..13.
                       //gives correct answer
    read_mem_fence(CLK_LOCAL_MEM_FENCE); 
    tmp0=Lcache1[nc][localRow]; 
    tmp1=Lcache1a[nc][localRow]; 
    tmp2=Lcache1b[nc][localRow]; 
    tmp3=Lcache1c[nc][localRow]; 
    tmp4=Lcache2[nc][localCol]; 
    tmp5=Lcache2a[nc][localCol]; 
    tmp6=Lcache2b[nc][localCol]; 
    tmp7=Lcache2c[nc][localCol]; 
    read_mem_fence(CLK_LOCAL_MEM_FENCE);
    sumPatch+=tmp0*tmp4;
    sumPatcha+=tmp0*tmp6;
    sumPatchb+=tmp2*tmp4;
    sumPatchc+=tmp2*tmp6;
    sumPatch2+=tmp1*tmp5;
    sumPatcha2+=tmp1*tmp7;
    sumPatchb2+=tmp3*tmp5;
    sumPatchc2+=tmp3*tmp7;
}

这可能是新GPU的广播技术吗？同样，超过16个元素的总和意味着仅使用16个存储库？该设备具有32个用于本地访问的存储区。

这是我尝试隐藏的内存延迟：

for(int kk=0;kk< 16;kk++) 
{   
    int nc=(kk+lid)&15;//different for all local threads
                       //but does not exceed 0-15 range
                       //summation order is not important
                       //0.+1.+...15. or 14.+15.+0.+..13.
                       //gives correct answer
    read_mem_fence(CLK_LOCAL_MEM_FENCE); 
    tmp0=Lcache1[nc][localRow]; 
    tmp4=Lcache2[nc][localCol];
    sumPatch+=tmp0*tmp4; 
    tmp6=Lcache2b[nc][localCol];
    sumPatcha+=tmp0*tmp6; 
    tmp1=Lcache1a[nc][localRow];
    tmp7=Lcache2c[nc][localCol]; 
    sumPatcha2+=tmp1*tmp7; 
    tmp5=Lcache2a[nc][localCol];
    sumPatch2+=tmp1*tmp5; 
    tmp2=Lcache1b[nc][localRow]; 
    sumPatchb+=tmp2*tmp4;
    sumPatchc+=tmp2*tmp6; 
    tmp3=Lcache1c[nc][localRow]; 
    sumPatchb2+=tmp3*tmp5;
    sumPatchc2+=tmp3*tmp7;  
    read_mem_fence(CLK_LOCAL_MEM_FENCE);//this lines' position does not change time 
}

但这并没有增加或减少exec。时间。

如何改善内核时间？可以吗

设备：HD7870 @ 1000MHz / 1200MHz主机：FX8150 @ 4GHz标头，来自Khronos网站的LIB文件，来自AMD驱动程序的opencl.dll。

时间采样是通过以下方式完成的：循环100次内核，并将总时间除以100.0（来自Stopwatchstart（）和stop（）方法）。仅用于执行，不包括阵列副本。

将所有结果与具有相同随机矩阵输入的朴素3嵌套循环版本进行比较（结果在m（ij）+/- delta内，其中delta为0.001f。）

此处的内核是更通用的内核的简化版本（适用于不同的矩阵和补丁大小）

此版本的内核参数：全局= 512,512本地= 16,16，参考== 0

对于8320x8320矩阵-> Global = 4160,4160，Local = 16,16，ref = 0,0 time = 1.87Seconds

编辑：通过DarkZeros的建议，通过私有版本替换本地Lcache3可以将1024x1024时间提高到2.7 ms。这是每秒795 Glops。这必须来自更好的职业比例。

Edit2：较少的本地使用率打开了使用48x48（9 x 16x16）补丁的可能性，这些补丁使1056x1056乘以2.4 ms ----> 981 Gflops / s。8208x8208在961毫秒内完成，超过了1150 GFlops。

黑暗零

为什么有那么多栅栏？实际上，我认为您甚至根本不需要它们。仅当写入本地的线程将被其他线程读取时，您才需要隔离。当该线程读取并写入其本地内存时不行。

BTW围栏比障碍要好得多。在障碍中，您强制线程同步。在某些情况下，这会降低性能。

我认为您可以通过更改内存访问模型来重写代码以提高速度。

您可以尝试一下是否可以更好地工作（我做了很多明显的优化，甚至都不知道您的代码在做什么）：

__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3) 
{
    int localRow = get_local_id(0); 
    int localCol = get_local_id(1);  
    int selectRowFromA = get_group_id(0)*32;     
    int selectColFromB = get_group_id(1)*32;     
    int lid= localCol*16+localRow; 
    __local float Lcache1[ 16][ 16]; 
    __local float Lcache2[ 16][ 16]; 
    __local float Lcache3[ 16][ 16]; 

    __local float Lcache1a[ 16][ 16]; 
    __local float Lcache2a[ 16][ 16]; 
    __local float Lcache3a[ 16][ 16]; 

    __local float Lcache1b[ 16][ 16]; 
    __local float Lcache2b[ 16][ 16]; 
    __local float Lcache3b[ 16][ 16]; 

    __local float Lcache1c[ 16][ 16]; 
    __local float Lcache2c[ 16][ 16]; 
    __local float Lcache3c[ 16][ 16]; 

    float tmp0=0.0f; 
    float tmp1=0.0f; 
    float tmp2=0.0f; 
    float tmp3=0.0f; 

    float tmp4=0.0f; 
    float tmp5=0.0f; 
    float tmp6=0.0f; 
    float tmp7=0.0f; 

    float sumPatch=0.0f; 
    float sumPatcha=0.0f; 
    float sumPatchb=0.0f; 
    float sumPatchc=0.0f; 
    float sumPatch2=0.0f; 
    float sumPatcha2=0.0f; 
    float sumPatchb2=0.0f; 
    float sumPatchc2=0.0f; 

    Lcache3[localRow][localCol]=0.0f; 
    Lcache3a[localRow][localCol]=0.0f; 
    Lcache3b[localRow][localCol]=0.0f; 
    Lcache3c[localRow][localCol]=0.0f; 
    for(int i=0;i<1024;i+=32)  // this is A's row and B's column parsed by sub-matrices
    { 
        Lcache1[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024];
        Lcache2[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024];
        Lcache1a[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16];
        Lcache2a[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16];
        Lcache1b[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16384];
        Lcache2b[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16384];
        Lcache1c[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16+16384];
        Lcache2c[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16+16384];
        mem_fence(CLK_LOCAL_MEM_FENCE);  
        sumPatch=0.0f; 
        sumPatcha=0.0f; 
        sumPatchb=0.0f; 
        sumPatchc=0.0f; 
        sumPatch2=0.0f; 
        sumPatcha2=0.0f; 
        sumPatchb2=0.0f; 
        sumPatchc2=0.0f; 
        for(int kk=0;kk< 16;kk++) //this is sub-matrix multiplication
        {   
            tmp0=Lcache1[kk][localRow];  // row-major
            tmp1=Lcache1a[kk][localRow]; // accesses
            tmp2=Lcache1b[kk][localRow]; //to local memory
            tmp3=Lcache1c[kk][localRow]; 
            tmp4=Lcache2[kk][localCol]; 
            tmp5=Lcache2a[kk][localCol]; 
            tmp6=Lcache2b[kk][localCol]; 
            tmp7=Lcache2c[kk][localCol]; 
            sumPatch+=tmp0*tmp4; 
            sumPatcha+=tmp0*tmp6; 
            sumPatchb+=tmp2*tmp4; 
            sumPatchc+=tmp2*tmp6; 
            sumPatch2+=tmp1*tmp5; 
            sumPatcha2+=tmp1*tmp7; 
            sumPatchb2+=tmp3*tmp5; 
            sumPatchc2+=tmp3*tmp7; 
        } 
        Lcache3[localRow][localCol]+=sumPatch+sumPatch2; 
        Lcache3a[localRow][localCol]+=sumPatcha+sumPatcha2; 
        Lcache3b[localRow][localCol]+=sumPatchb+sumPatchb2; 
        Lcache3c[localRow][localCol]+=sumPatchc+sumPatchc2; 
    } 
    mem_fence(CLK_LOCAL_MEM_FENCE); 
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024]=Lcache3[localRow][localCol];                   
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16]=Lcache3a[localRow][localCol];              
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16384]=Lcache3b[localRow][localCol];     
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16+16384]=Lcache3c[localRow][localCol];     

}

本文收集自互联网，转载请注明来源。

如有侵权，请联系[email protected] 删除。

编辑于2021-06-2

我来说两句

0条评论

登录后参与评论

来自分类Dev

Related 相关文章

文章

GPU上的矩阵乘法。内存库冲突和延迟隐藏

GPU上的矩阵乘法。内存库冲突和延迟隐藏

Tensorflow：GPU上矩阵乘法（NaN）的错误结果

CUDA：具有共享内存和矩阵大小（不是块大小的倍数）的平铺矩阵矩阵乘法

Strassen矩阵乘法的内存管理

矩阵和张量乘法

不同 GPU 上的 Tensorflow 执行和内存

在Mat矩阵上执行简单的逆和乘法运算

在某些情况下，GPU上的矩阵乘法/加法运算非常慢

iOS-GPU加速矩阵转置，乘法和本征分解难题

Numpy 数组和矩阵乘法

显示/隐藏和滑块冲突

幼稚矩阵乘法的OpenCL CPU比OpenCL GPU更快

numpy：矩阵乘法比矢量和快吗？

向量化和嵌套矩阵乘法

R中的矩阵和向量乘法运算

矩阵和向量的Fortran元素明智乘法

numpy中的矩阵重塑和乘法

numpy：矩阵乘法比矢量和快吗？

矩阵和向量乘法python3

numpy矩阵乘法每个col和sum

具有简单矩阵乘法的Scipy稀疏内存爆炸

Python 矩阵乘法索引交换内存使用示例

什么是GPU上的相干内存？

CUDA共享内存库冲突报告较高

大矩阵和内存问题

MPI库和内存

如何无需测量即可获取/计算GPU的内存延迟？

MATLAB GPU-CUDA内存副本的延迟时间？

使用“异或”，“非”或“布尔”乘法（使用本征矩阵库）

同态加密：使用PALISADE库进行矩阵矢量乘法