将cuBLAS与Thrust中的复数一起使用 [英] Using cuBLAS with complex numbers from Thrust

查看:165
本文介绍了将cuBLAS与Thrust中的复数一起使用的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

在我的代码中,我使用推力库中具有复数的数组,并且我想使用cublasZgeam()来转置数组。



使用cuComplex.h中的复数不是一个可取的选择，因为我在数组上做了很多算术运算，而cuComplex并没有定义 *=、+= 等运算符。

这就是我定义要转置的数组的方式

  thrust::complex<float> u[xmax][xmax];

我发现了这个 https://github.com/jtravs/cuda_complex ,但按如下方式使用它:

  #include "cuComplex.hpp"

在通过nvcc编译时,不允许我使用提到的运算符

 错误：没有运算符 "+=" 匹配这些操作数
         操作数类型为：cuComplex += cuComplex

对此是否有解决方案?来自github的代码很旧,可能存在问题,或者我使用的是错误的



编辑：下面是可以正常工作的代码，与 talonmies 的代码唯一的不同是添加了一个简单的内核，以及一个指向相同数据、但类型为 thrust::complex 的指针。

#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

__global__ void test(thrust::complex<double>* u) {

  u[0] += thrust::complex<double>(3.3,3.3);
}

int main()
{
  int xmax = 100;
  thrust::complex<double>  u[xmax][xmax];
  double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

  thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
  u[49][51] += thrust::complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;
  cout << u[0][0] << endl;

  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);
  test<<<1,1>>>(d_vTest);
  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
  cout << "After:" << endl;
  cout << u[0][0] << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  return 0;
}


解决方案

尽管您提出了相反的抗议，C++ 标准库的 complex（或 thrust::complex）确实可以与 cuBLAS 一起使用。cuComplex 和 cuDoubleComplex 被设计为与标准的主机端复数类型二进制兼容，因此在将数据传给使用设备端复数数据的 cuBLAS 函数时无需进行任何转换。



对您在注释中发布的代码进行简单的修改完全可以像您想象的那样工作:

#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>

using namespace std;

int main()
{
  int xmax = 100;
  complex<double>  u[xmax][xmax];
  double arrSize = sizeof(complex<double>) * xmax * xmax;

  fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
  u[49][51] += complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  complex<double> alpha(1.0, 0.0);
  complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);

  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);

  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  return 0;
}

构建并运行如下:

~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
之前:
(666,666)
(2,2)
之后:
(2,2)
(666,666)

唯一需要的修改是将 std::complex<double> 类型显式转换（cast）为 cuDoubleComplex。这样做之后，一切都会按预期工作。



使用 thrust 时，代码看起来几乎相同：

#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

int main()
{
  int xmax = 100;
  thrust::complex<double>  u[xmax][xmax];
  double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

  thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
  u[49][51] += thrust::complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);

  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);

  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  return 0;
}

下面的例子也许更接近您的用例：使用 thrust 的设备容器，并在调用 cuBLAS 之前用一个内核做一些初始化：

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>

__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
  u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
  u[49 + 51*xmax] *= 2.0;
}

int main()
{
  int xmax = 100;

  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
  thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
  setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);

  cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
  cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, _d_u, xmax,
                  _beta, _d_u, xmax,
                  _d_v, xmax);

  thrust::complex<double>  u[xmax][xmax];

  thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
  std::cout << "Before:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
  std::cout << "After:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  return 0;

}


In my code I use arrays with complex numbers from thrust library and I would like to use cublasZgeam() in order to transpose the array.

Using complex numbers from cuComplex.h is not a preferable option since I do a lot of arithmetic on the array and cuComplex doesn't define operators such as *= and +=.

This is how I defined array which I want to transpose

thrust::complex<float> u[xmax][xmax];

I have found this https://github.com/jtravs/cuda_complex, but using it as such:

#include "cuComplex.hpp"

doesn't allow me to use the mentioned operators when compiling with nvcc

error: no operator "+=" matches these operands
        operand types are: cuComplex += cuComplex

Is there some solution to this? Code from github is old and there may lay the issue or maybe I am using it wrong

EDIT: Here is code which works, only difference from talonmies code is adding simple kernel and pointer to same data but being thrust::complex

#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

// Device-side sanity check: bump the first element of the array using
// thrust::complex's overloaded += operator from within a kernel.
__global__ void test(thrust::complex<double>* u) {
  const thrust::complex<double> delta(3.3, 3.3);
  *u += delta;
}

// Host driver: fills a matrix with (1,1), perturbs two elements, transposes
// it on the GPU with cublasZgeam, then nudges element 0 with a kernel.
// thrust::complex<double> is binary compatible with cuDoubleComplex, so the
// same device buffer is viewed through both types via reinterpret_cast.
int main()
{
  const int xmax = 100;   // square matrix dimension (const: avoids a VLA)
  thrust::complex<double>  u[xmax][xmax];
  // Byte size of the matrix. The original declared this `double`, forcing a
  // lossy double -> size_t conversion at every allocation/copy call.
  const auto arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

  thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
  u[49][51] += thrust::complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;
  cout << u[0][0] << endl;

  // geam computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
  // op(A) = A^T yields a pure transpose.
  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  // Same storage as d_v, typed so the kernel can use thrust's operators.
  thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);
  test<<<1,1>>>(d_vTest);
  // Surface any launch-config or execution error before trusting the copy
  // below (kernel launches report errors asynchronously).
  if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
    cout << "CUDA error after kernel launch" << endl;
  }
  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
  cout << "After:" << endl;
  cout << u[0][0] << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // Release device memory and the cuBLAS handle (the original leaked both).
  cudaFree(d_u);
  cudaFree(d_v);
  cublasDestroy(handle);

  return 0;
}

解决方案

Despite your protestations to the contrary, the C++ standard library complex (or thrust::complex) most certainly does work with CUBLAS. The cuComplex and cuDoubleComplex types are designed to be binary compatible with the standard host complex types, so that no translation is required when data is passed to CUBLAS functions which use complex data on the device.

A simple modification to the code you posted in comments works exactly as you might imagine:

#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>

using namespace std;

// Transposes a 100x100 std::complex<double> matrix on the GPU with
// cublasZgeam. std::complex<double> is binary compatible with
// cuDoubleComplex, so host pointers are passed via reinterpret_cast with
// no data conversion.
int main()
{
  const int xmax = 100;   // square matrix dimension (const: avoids a VLA)
  complex<double>  u[xmax][xmax];
  // Byte size of the matrix; the original used `double`, forcing a lossy
  // double -> size_t conversion at every allocation/copy call.
  const auto arrSize = sizeof(complex<double>) * xmax * xmax;

  fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
  u[49][51] += complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // geam computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
  // op(A) = A^T yields a pure transpose.
  complex<double> alpha(1.0, 0.0);
  complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);

  // Blocking copy; also synchronizes with the preceding cuBLAS call.
  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);

  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // Release device memory and the cuBLAS handle (the original leaked both).
  cudaFree(d_u);
  cudaFree(d_v);
  cublasDestroy(handle);

  return 0;
}

built and run like so:

~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose 
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)

The only modifications required are explicit casts of the std::complex<double> types to cuDoubleComplex. Do that and everything works as expected.

Using thrust, the code looks almost identical:

#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

// Identical transpose example, but using thrust::complex<double> on the
// host. thrust::complex is binary compatible with cuDoubleComplex, so the
// same reinterpret_cast pattern applies.
int main()
{
  const int xmax = 100;   // square matrix dimension (const: avoids a VLA)
  thrust::complex<double>  u[xmax][xmax];
  // Byte size of the matrix; the original used `double`, forcing a lossy
  // double -> size_t conversion at every allocation/copy call.
  const auto arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

  thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
  u[49][51] += thrust::complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // geam computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
  // op(A) = A^T yields a pure transpose.
  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);

  // Blocking copy; also synchronizes with the preceding cuBLAS call.
  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);

  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // Release device memory and the cuBLAS handle (the original leaked both).
  cudaFree(d_u);
  cudaFree(d_v);
  cublasDestroy(handle);

  return 0;
}

Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>

// Seeds two entries of the xmax*xmax device matrix before the cuBLAS
// transpose, mirroring the host-side initialisation of the earlier
// examples. Guarded so the update is applied exactly once even if the
// kernel is ever launched with more than one thread (the original
// silently assumed a <<<1,1>>> launch and would race otherwise).
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
    u[49 + 51*xmax] *= 2.0;
  }
}

// Same transpose, but with thrust::device_vector managing device storage
// and a kernel seeding elements on the device before the cuBLAS call.
// Everything is issued on the legacy default stream, so the kernel is
// ordered before cublasZgeam without an explicit synchronization.
int main()
{
  const int xmax = 100;   // square matrix dimension (const: avoids VLAs)

  // geam computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
  // op(A) = A^T yields a pure transpose.
  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  // Device buffers are freed automatically when the vectors leave scope.
  thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
  thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
  setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);

  // thrust::complex<double> is binary compatible with cuDoubleComplex, so
  // the raw device pointers can be reinterpreted for the cuBLAS call.
  cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
  cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, _d_u, xmax,
                  _beta, _d_u, xmax,
                  _d_v, xmax);

  thrust::complex<double>  u[xmax][xmax];

  // thrust::copy device->host blocks, so results below are complete.
  thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
  std::cout << "Before:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
  std::cout << "After:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  // Release the cuBLAS handle (the original leaked it).
  cublasDestroy(handle);

  return 0;

}

这篇关于将cuBLAS与Thrust中的复数一起使用的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆