C++ AMP vs Thrust vs OpenCL [英] C++ AMP vs Thrust vs OpenCL

查看:98
本文介绍了C++ AMP vs Thrust vs OpenCL的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我使用OpenCL在GPU中编程,但我很乐意使用更简单的系统来并行化程序



它们意味着更少的代码需要更改才能引入GPU?

C++ AMP和Thrust是否允许在返回结果之前在GPU内顺序运行几个函数?



我尝试过:



我使用OpenCL编写代码但是必须完成大量工作才能并行化算法,即系统准备,缓冲计算,数据传输,...

解决方案

但必须完成大量的工作才能并行化算法,即系统准备、缓冲区创建、数据传输





如果你为每一个新的编程工作都重写整个OpenCL流程,那么你需要缩短编写部分:使用OOP,例如按RAII风格规则创建一个对象,为你完成OpenCL中那些特定的部分,这样你写的1行代码将等于OpenCL API的几十行,而且你将不必记住"资源释放"操作的执行顺序。



如果您不想重新发明自己的轮子,如果您对错误修正和功能集成的官僚机构的延迟感到满意,那么还有其他轮子。



Sycl:



 std :: vector h_a(LENGTH );  //   a vector  
std :: vector h_b(LENGTH); // b vector
std :: vector h_c(LENGTH); // c vector
std :: vector h_r(LENGTH,0xdeadbeef); // d vector(result)
/ / 使用随机浮点值填充向量a和b
int count = LENGTH ;
for int i = 0 ; i< count; i ++){
h_a [i] = rand()/( float )RAND_MAX;
h_b [i] = rand()/( float )RAND_MAX;
h_c [i] = rand()/( float )RAND_MAX;
}
{
// 设备缓冲区
缓冲区d_a(h_a);
buffer d_b(h_b);
buffer d_c(h_c);
buffer d_r(h_d);
queue myQueue;
command_group(myQueue,[&]()
{
// 数据访问者
auto a = d_a.get_access< access :: read>();
auto b = d_b.get_access< access :: read>();
auto c = d_c.get_access< access :: read> ;();
auto r = d_r.get_access< access :: write>();
// 内核
parallel_for(count,kernel_functor([=](id<> item){
int i = item.get_global( 0 );
r [i] = a [i] + b [ i] + c [i];
}));
});
}





阵火:



<前lang = c ++> // 选择一个设备并显示arrayfire信息
int device = argc> 1 ? atoi(argv [ 1 ]): 0 ;
af :: setDevice(device);
af :: info();
printf( 在GPU \ n上创建一个5乘3的随机浮点矩阵< /跨度>);
array A = randu( 5 3 ,f32);
af_print(A);
printf( 按元素算术\ n);
array B = sin(A)+ 1 5 ;
af_print(B);
printf( 否定第二列的前三个元素\ n);
B(seq( 0
2 ), 1 )= B(seq( 0
2 ), 1 )* - 1 ;
af_print(B);
printf( 傅里叶变换结果\ n);
array C = fft(B);
af_print(C);
printf( 抓住最后一行\ n);
array c = C.row(end);
af_print(c);
printf( Scan Test \ n);
dim4 dims( 16 4 1 1 );
array r =常量( 2 ,dims);
af_print(r);
printf( Scan \ n);
array S = af :: scan(r, 0 ,AF_BINARY_MUL);
af_print(S);
printf( 从主机数据创建2×3矩阵\ n) ;
float d [] = { 1 2 3 4 5 6 };
array D( 2 3 ,d,afHost);
af_print(D);
printf( 将最后一列复制到第一列\ n);
D.col( 0 )= D.col(结束);
af_print(D);
// 排序A
printf( 排序A并打印排序的数组和相应的索引\ n);
array vals,inds;
sort(vals,inds,A);
af_print(vals);
af_print(inds);





使用运行时API,CUDA比OpenCL更容易进入。如果您可以使用Nvidia硬件,那么请看一下:



  #include   <   stdio.h  >  

__global__
void saxpy ( int n, float a, float * x, float * y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i< n)y [i] = a * x [i] + y [i];
}

// Host driver: allocates 1M floats on host and device, runs SAXPY with
// a = 2 on x = 1, y = 2, and verifies every element equals 4.
int main(void)
{
  int N = 1 << 20; // 1M elements
  float *x, *y, *d_x, *d_y;
  x = (float*)malloc(N * sizeof(float));
  y = (float*)malloc(N * sizeof(float));

  // Every CUDA runtime call returns a status; check it instead of
  // silently continuing with invalid device pointers.
  cudaError_t err;
  err = cudaMalloc(&d_x, N * sizeof(float));
  if (err != cudaSuccess) { printf("cudaMalloc d_x: %s\n", cudaGetErrorString(err)); return 1; }
  err = cudaMalloc(&d_y, N * sizeof(float));
  if (err != cudaSuccess) { printf("cudaMalloc d_y: %s\n", cudaGetErrorString(err)); return 1; }

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);

  // Perform SAXPY on 1M elements; ceil-div grid covers the partial tail block
  saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);
  err = cudaGetLastError(); // launch-configuration errors surface here
  if (err != cudaSuccess) { printf("saxpy launch: %s\n", cudaGetErrorString(err)); return 1; }

  // Blocking copy; also synchronizes with the kernel on the default stream
  cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);

  // Expected result: 2*1 + 2 = 4 for every element
  float maxError = 0.0f;
  for (int i = 0; i < N; i++) {
    float e = y[i] - 4.0f;
    if (e < 0.0f) e = -e; // absolute value without relying on abs() overloads
    if (e > maxError) maxError = e;
  }
  printf("Max error: %f\n", maxError);

  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
}





但如果您更喜欢CUDA的驱动程序API,那么它将像OpenCL一样难以作为学习曲线。但是,当然,通过足够的封装,您可以制作自己的单行算法来对事物进行排序,映射和计算。此外,CUDA还有许多工具可以帮助开发人员进行错误修正,优化和计算(因此您可能不需要为自己创建FFT,而是直接使用CUDA平台中的相关工具)。


I program in GPUs using OpenCL but I would be happy with a easier system to parallelize the program

What of them implies less code to be changed to introduce in GPU?
C++ amp and Trust allows run several functions sequentially inside the GPU before returning results?

What I have tried:

I made code using OpenCL but a lot of work must be done added to the parallelizing the algorithm that is the system preparision, buffer ctreation, data transfers, ...

解决方案

but a lot of work must be done in addition to parallelizing the algorithm, that is the system preparation, buffer creation, data transfers



If you are re-writing whole OpenCL stages for every new programming work, then you need to shorten the writing part by using OOP such as creating an object for doing not all but specialized parts of OpenCL for you within RAII style rules so that what you write 1 liner will be equal to 10s of lines of OpenCL API and you won't have to remember in which order of "resource release" operations will be done.

If you don't want to re-invent your own wheel and if you are ok with the latency of bureaucracy for bugfixing and feature integrations, then there are others wheels.

Sycl:

std::vector h_a(LENGTH);             // a vector
  std::vector h_b(LENGTH);             // b vector
  std::vector h_c(LENGTH);             // c vector
  std::vector h_r(LENGTH, 0xdeadbeef); // r vector (result), poisoned so a
                                       // missed kernel write is noticeable
  // Fill vectors a, b and c with random float values in [0, 1]
  int count = LENGTH;
  for (int i = 0; i < count; i++) {
    h_a[i] = rand() / (float)RAND_MAX;
    h_b[i] = rand() / (float)RAND_MAX;
    h_c[i] = rand() / (float)RAND_MAX;
  }
  {
    // Device buffers wrapping the host vectors; when this scope ends the
    // buffers are destroyed and results are written back to the host (RAII)
    buffer d_a(h_a);
    buffer d_b(h_b);
    buffer d_c(h_c);
    buffer d_r(h_r); // bug fix: was h_d, which is never declared anywhere
    queue myQueue;
    command_group(myQueue, [&]()
    {
      // Accessors declare how the kernel uses each buffer (read vs write)
      auto a = d_a.get_access<access::read>();
      auto b = d_b.get_access<access::read>();
      auto c = d_c.get_access<access::read>();
      auto r = d_r.get_access<access::write>();
      // Kernel: element-wise r = a + b + c over `count` work-items
      parallel_for(count, kernel_functor([=](id<> item) {
        int i = item.get_global(0);
        r[i] = a[i] + b[i] + c[i];
      }));
    });
  }



Arrayfire:

// ArrayFire example: device selection, element-wise math, FFT, scan,
// host<->device data movement and sorting. Each printf labels the next step.
// Select a device and display arrayfire info
int device = argc > 1 ? atoi(argv[1]) : 0;
af::setDevice(device);
af::info();
printf("Create a 5-by-3 matrix of random floats on the GPU\n");
array A = randu(5,3, f32);
af_print(A);
printf("Element-wise arithmetic\n");
array B = sin(A) + 1.5;
af_print(B);
printf("Negate the first three elements of second column\n");
// seq(0, 2) selects rows 0..2 inclusive; column index 1 is the second column
B(seq(0, 2), 1) = B(seq(0, 2), 1) * -1;
af_print(B);
printf("Fourier transform the result\n");
array C = fft(B);
af_print(C);
printf("Grab last row\n");
array c = C.row(end);
af_print(c);
printf("Scan Test\n");
dim4 dims(16, 4, 1, 1);
// constant(2, dims) fills a 16x4 array with the value 2
array r = constant(2, dims);
af_print(r);
printf("Scan\n");
// Inclusive scan along dimension 0 with multiplication -> running product
array S = af::scan(r, 0, AF_BINARY_MUL);
af_print(S);
printf("Create 2-by-3 matrix from host data\n");
float d[] = { 1, 2, 3, 4, 5, 6 };
// afHost tells ArrayFire the pointer is host memory (it is copied to device)
array D(2, 3, d, afHost);
af_print(D);
printf("Copy last column onto first\n");
D.col(0) = D.col(end);
af_print(D);
// Sort A
printf("Sort A and print sorted array and corresponding indices\n");
array vals, inds;
sort(vals, inds, A);
af_print(vals);
af_print(inds);



CUDA has an easier entrance than OpenCL with its runtime API. If you are ok with being resticted to Nvidia hardware, then have a look at this:

#include <stdio.h>

// SAXPY kernel: y[i] = a*x[i] + y[i] for i in [0, n).
// Launch layout: 1-D grid of 1-D blocks, one element per thread.
// The bounds check guards the tail block when n is not a multiple of blockDim.x.
__global__
void saxpy(int n, float a, float *x, float *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[i] = a*x[i] + y[i];
}

// Host driver: allocates 1M floats on host and device, runs SAXPY with
// a = 2 on x = 1, y = 2, and verifies every element equals 4.
int main(void)
{
  int N = 1<<20; // 1M elements
  float *x, *y, *d_x, *d_y;
  x = (float*)malloc(N*sizeof(float));
  y = (float*)malloc(N*sizeof(float));

  // Every CUDA runtime call returns a status; check it instead of
  // silently continuing with invalid device pointers.
  cudaError_t err;
  err = cudaMalloc(&d_x, N*sizeof(float));
  if (err != cudaSuccess) { printf("cudaMalloc d_x: %s\n", cudaGetErrorString(err)); return 1; }
  err = cudaMalloc(&d_y, N*sizeof(float));
  if (err != cudaSuccess) { printf("cudaMalloc d_y: %s\n", cudaGetErrorString(err)); return 1; }

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);

  // Perform SAXPY on 1M elements; ceil-div grid covers the partial tail block
  saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
  err = cudaGetLastError(); // launch-configuration errors surface here
  if (err != cudaSuccess) { printf("saxpy launch: %s\n", cudaGetErrorString(err)); return 1; }

  // Blocking copy; also synchronizes with the kernel on the default stream
  cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);

  // Expected result: 2*1 + 2 = 4 for every element
  float maxError = 0.0f;
  for (int i = 0; i < N; i++) {
    float e = y[i] - 4.0f;
    if (e < 0.0f) e = -e; // absolute value without relying on abs() overloads
    if (e > maxError) maxError = e;
  }
  printf("Max error: %f\n", maxError);

  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
}



but if you prefer CUDA's driver-API then it will be as hard as OpenCL as a learning curve. But ofcourse, with enough encapsulation, you can make your own 1-liner algorithms to sort, map and compute things. Also CUDA has many tools to aid developers in bugfixing, optimizing and computing (so that you may not need to invent a FFT for yourself but directly use the relevant tool in CUDA platform).


这篇关于C++ AMP vs Thrust vs OpenCL的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆