CUDA C-使用clock（）和cudaEvent的CPU和GPU执行时间，对吗？

Avinash 发表于 Dev

阿维纳什

我编写了一个添加两个2D数组的程序，以检查CPU和GPU的性能。我使用clock（）函数来测量CPU执行，并使用cudaEvent来测量GPU中的内核执行时间。由于我是在Udacity下学习CUDA的，所以我尝试在他们的服务器上执行该程序，结果是，

 Output:
 GPU: 0.001984 ms
 CPU : 30.000000 ms

现在我要问一个真正的问题，我发现这些结果在GPU上的运行速度惊人，现在我对这些结果是否准确还是我在程序中犯了任何错误表示怀疑？

这是我的程序：

 #include "stdio.h"
 #include<time.h>
 #define COLUMNS 900
 #define ROWS 900
 long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS];
__global__ void add(long *a, long *b, long *c,long *d)
{
 int x = blockIdx.x;
 int y = blockIdx.y;
 int i = (COLUMNS*y) + x;
 c[i] = a[i] + b[i];
 a[i]=d[i];
}

int main()
{
  long *dev_a, *dev_b, *dev_c,*dev_d;
  float ms;
  clock_t startc, end;
  double cpu_time_used;
  cudaEvent_t start,stop;


 cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));

 startc = clock();
 for (long y = 0; y < ROWS; y++) // Fill Arrays
 for (long x = 0; x < COLUMNS; x++)
 {
     a[y][x] = x;
     b[y][x] = y;
     d[y][x]=rand()%4;
     c[y][x]=a[y][x]+b[y][x];
 }
 end = clock();

cpu_time_used = ((double) (end - startc)) / CLOCKS_PER_SEC;
cpu_time_used*=1000;


cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);


cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0); 
cudaEventRecord(stop, 0);


add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);

cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);




printf("GPU: %f ms",ms);
printf("\n CPU : %f ms",cpu_time_used);

 return 0;
}

谢谢大家为我的查询提供的答案，这是我对代码所做的更改和更新的结果，

更新的代码：

#include "stdio.h"
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#define COLUMNS 500
#define ROWS 500
long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS];



__global__ void add(long *a, long *b, long *c,long *d)
{
 int x = blockIdx.x;
 int y = blockIdx.y;
 int i = (COLUMNS*y) + x;
 c[i] = a[i] + b[i];
 a[i]=d[i];
}
int main()
{
 long *dev_a, *dev_b, *dev_c,*dev_d;
 struct timeval startc, end;
 float ms;
 long mtime, seconds, useconds;
 //   clock_t startc, end;
 //  double cpu_time_used;
 long ns;
 cudaEvent_t start,stop;


 cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));

 gettimeofday(&startc, NULL);
 for (long y = 0; y < ROWS; y++) // Fill Arrays
 for (long x = 0; x < COLUMNS; x++)
 {
  a[y][x] = x;
  b[y][x] = y;
  d[y][x]=rand()%4;
  c[y][x]=a[y][x]+b[y][x];
 }
  gettimeofday(&end, NULL);

 seconds  = end.tv_sec  - startc.tv_sec;
 useconds = end.tv_usec - startc.tv_usec;
 mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;


for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
 {
 for (long x = COLUMNS-1; x < COLUMNS; x++)
 {
    // printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
   //   printf("[%d][%d]=%d ",y,x,d[y][x]);
 }
 printf("\n");
 }



cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);


cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0); 



add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);

cudaThreadSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);

cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);



//cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("GPU: %f ms",ms);
printf("\n CPU : %ld ms",mtime);
 for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
 {
     for (long x = COLUMNS-1; x < COLUMNS; x++)
     {
      //   printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
      //   printf("[%d][%d]=%d ",y,x,d[y][x]);
     }
     printf("\n");
 }
 return 0;
}

输出：

GPU: 0.011040 ms
CPU : 9 ms

现在我可以安全地说出它是否正确吗？

克里斯蒂安·萨罗芬（Christian Sarofeen）

您认为速度过高，CPU的时间太长是正确的。使用此方法来计时CPU C ++在Linux上获得毫秒级时间-clock（）似乎无法正常工作，您可能还必须将其cudaEventRecord(stop, 0);移至内核之后。

我在您的内核中看到5次读写。以5*4Bytes*500*500/(1024^3*0.009)你得到关于0.517 GB/s你的记忆，这是可用的一小部分。我会说您的CPU版本需要一些工作。相反，您的GPU处于5*4Bytes*500*500/(1024^3*0.01104e-3)左右421GB/s。我会说你还不在那里。

这么多错误...

#include "stdio.h"
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define COLUMNS 500
#define ROWS 500
long a[ROWS*COLUMNS], b[ROWS*COLUMNS], c[ROWS*COLUMNS],d[ROWS*COLUMNS];



__global__ void add(long *a, long *b, long *c,long *d)
{
 int x = blockIdx.x;
 int y = blockIdx.y;
 int i = (COLUMNS*y) + x;
 c[i] = a[i] + b[i];
 a[i]=d[i];
}
int main()
{
 long *dev_a, *dev_b, *dev_c,*dev_d;
 struct timeval startc, end;
 float ms;
 long seconds, useconds;
 double mtime;
 cudaEvent_t start,stop;


 for(int i=0; i<ROWS*COLUMNS; i++)
     d[i]=rand()%4;

 for(int i=0; i<ROWS; i++){
     for(int j=0; j<COLUMNS; j++){
         a[i*COLUMNS+j]=j;
         b[i*COLUMNS+j]=i;
     }
 }

 cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
 cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));



 gettimeofday(&startc, NULL);
 for (long i = 0; i < ROWS*COLUMNS; i++){ // Fill Arrays
     c[i]=a[i]+b[i];
     a[i]=d[i];
 }
  gettimeofday(&end, NULL);

 seconds  = end.tv_sec  - startc.tv_sec;
 useconds = end.tv_usec - startc.tv_usec;
 mtime = useconds;
 mtime/=1000;
 mtime+=seconds*1000;

for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
 {
 for (long x = COLUMNS-1; x < COLUMNS; x++)
 {
    // printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
   //   printf("[%d][%d]=%d ",y,x,d[y][x]);
 }
 printf("\n");
 }



cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);


cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);



add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);



cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);

cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);

printf("GPUassert: %s\n", cudaGetErrorString(cudaGetLastError()));

//cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
double memXFers=5*4*COLUMNS*ROWS;
memXFers/=1024*1024*1024;


printf("GPU: %f ms bandwidth %g GB/s",ms, memXFers/(ms/1000.0));
printf("\n CPU : %g ms bandwidth %g GB/s",mtime, memXFers/(mtime/1000.0));
 for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
 {
     for (long x = COLUMNS-1; x < COLUMNS; x++)
     {
      //   printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
      //   printf("[%d][%d]=%d ",y,x,d[y][x]);
     }
     printf("\n");
 }

 return 0;
}

顺便说一下，我目前的结果（显然不正确）...

GPU: 0.001792 ms bandwidth 2598.56 GB/s
CPU : 0.567 ms bandwidth 8.21272 GB/s

本文收集自互联网，转载请注明来源。

如有侵权，请联系[email protected] 删除。

编辑于2021-02-20

我来说两句

0条评论

登录后参与评论

来自分类Dev

Related 相关文章

文章

CUDA C-使用clock（）和cudaEvent的CPU和GPU执行时间，对吗？

CUDA C-使用clock（）和cudaEvent的CPU和GPU执行时间，对吗？

C和C ++中执行时间的差异

如何使用CUDA C执行矩阵加法

CUDA和C ++中的名称处理

CUDA和C ++中的名称处理

CUDA 程序不测量执行时间：cudaEventRecord

可以远程运行CUDA C吗？

使用CMake使用CUDA代码编译c ++

具有和不具有参数C / C ++的函数之间的执行时间

在C＃中使用外部C / C ++ CUDA库

CUDA和C ++函数问题（Visual Studio 2013）

在QTCreator中用MinGW编译c ++和cuda代码

Visual Studio 2013中的C ++和Cuda速度

使用C程序获取Python脚本的执行时间

在Windows中使用C测量执行时间

交叉编译c / c ++ / cuda程序时尝试使用CMake

分段错误C ++ Cuda

Cuda数学与C ++数学

OpenCV 3.4：CPU和CUDA中调整大小的结果在C ++中不匹配

确定ASP.NET WEB API中每个API的执行时间和跟踪，并使用C＃使用log4net登录到文件

C和C ++中几乎相同的代码在执行时间上的巨大差异（x9）

如何在Cuda和C ++代码中进行可比的时间测量

CUDA C ++：64号有什么特别之处吗？

使用多个CUDA GPU

为什么我的CUDA内核执行时间会随着连续启动而增加？

相对于块数测量cuda执行时间

如何为初学者使用恒定内存（Cuda C）

使用CMAKE编译CUDA C ++-指定了多个编译阶段

如何使用CMake将C ++编译为CUDA

在CUDA C中使用原子操作时出错