Element-by-element vector multiplication with CUDA


Question



I have built a rudimentary kernel in CUDA to do an element-wise vector-vector multiplication of two complex vectors. The kernel code is inserted below (multiplyElementwise). It works fine, but since I noticed that other seemingly straightforward operations (like scaling a vector) are optimized in libraries such as CUBLAS or CULA, I was wondering whether I could replace my code with a library call. To my surprise, neither CUBLAS nor CULA has this option. I tried to fake it by treating one of the vectors as the diagonal of a diagonal matrix and using a matrix-vector product, but the result was really slow.

As a last resort I tried to optimize this code myself (see multiplyElementwiseFast below) by loading the two vectors into shared memory and working from there, but that turned out to be slower than my original code.

So my questions:

  1. Is there a library that does element-wise vector-vector multiplications?
  2. If not, can I accelerate my code (multiplyElementwise)?

Any help would be greatly appreciated!

__global__ void multiplyElementwise(cufftComplex* f0, cufftComplex* f1, int size)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < size)
    {
        float a, b, c, d;
        a = f0[i].x;  // Re(f0[i])
        b = f0[i].y;  // Im(f0[i])
        c = f1[i].x;  // Re(f1[i])
        d = f1[i].y;  // Im(f1[i])
        // 3-multiplication complex product:
        // (a+ib)(c+id) = (k - d') + i(k + c'), where
        // k = a(c+d), d' = d(a+b), c' = c(b-a)
        float k;
        k = a * (c + d);
        d =  d * (a + b);
        c =  c * (b - a);
        f0[i].x = k - d;
        f0[i].y = k + c;
    }
}

__global__ void multiplyElementwiseFast(cufftComplex* f0, cufftComplex* f1, int size)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < 4*size)
    {
        const int N = 256;
        const int thId = threadIdx.x / 4;
        const int rem4 = threadIdx.x % 4;
        const int i4 = i / 4;

        __shared__ float a[N];
        __shared__ float b[N];
        __shared__ float c[N];
        __shared__ float d[N];
        __shared__ float Re[N];
        __shared__ float Im[N];

        if (rem4 == 0)
        {
            a[thId] = f0[i4].x;
            Re[thId] = 0.f;
        }
        if (rem4 == 1)
        {
            b[thId] = f0[i4].y;
            Im[thId] = 0.f;
        }
        if (rem4 == 2)
            c[thId] = f1[i4].x;
        if (rem4 == 3)
            d[thId] = f1[i4].y;
        __syncthreads();

        if (rem4 == 0)
            atomicAdd(&(Re[thId]), a[thId]*c[thId]);        
        if (rem4 == 1)
            atomicAdd(&(Re[thId]), -b[thId]*d[thId]);
        if (rem4 == 2)
            atomicAdd(&(Im[thId]), b[thId]*c[thId]);
        if (rem4 == 3)
            atomicAdd(&(Im[thId]), a[thId]*d[thId]);
        __syncthreads();

        if (rem4 == 0)
            f0[i4].x = Re[thId];
        if (rem4 == 1)
            f0[i4].y = Im[thId];
    }
}        
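
A typical launch for multiplyElementwise would look like this; the block size and the device pointer names d_f0 and d_f1 are illustrative assumptions, not part of the question:

// Hypothetical launch configuration: 256 threads per block is an assumption.
const int threads = 256;
const int blocks  = (size + threads - 1) / threads;
multiplyElementwise<<<blocks, threads>>>(d_f0, d_f1, size);
cudaDeviceSynchronize();  // wait for the kernel so errors and timings are visible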

Solution

If what you are trying to achieve is a simple element-wise product of complex numbers, you seem to be doing some extra steps in your multiplyElementwise kernel that increase register usage. What you are trying to compute is:

f0[i].x = a*c - b*d;
f0[i].y = a*d + b*c;

since (a + ib)*(c + id) = (a*c - b*d) + i(a*d + b*c). By using your improved complex multiplication, you're trading one multiplication for three additions and some extra registers. Whether that trade-off pays off depends on the hardware you're using: if your hardware supports FMA (Fused Multiply-Add), that kind of optimization may not be a win. You should consider reading the document "Precision & Performance: Floating Point and IEEE 754 Compliance for NVIDIA GPUs", which also tackles the issue of floating-point precision.
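
If you want the straightforward 4-multiplication form without writing it out by hand, cuComplex.h ships a cuCmulf helper (cufftComplex is a typedef of cuFloatComplex), and the compiler is free to contract the resulting products and sums into FMAs on hardware that supports them. A minimal sketch:

#include <cufft.h>
#include <cuComplex.h>

// Plain 4-multiplication complex product: res = (a*c - b*d) + i(a*d + b*c).
// On FMA-capable hardware the compiler contracts these into fused
// multiply-adds, so the 3-multiplication trick buys little here.
__global__ void multiplyElementwiseSimple(cufftComplex* f0, cufftComplex* f1, int size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size)
        f0[i] = cuCmulf(f0[i], f1[i]);
}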

Still, you should consider using Thrust. This library offers many high-level tools to operate on both host and device vectors. You can see a long list of examples here: https://github.com/thrust/thrust/tree/master/examples. This would make your life a lot easier.

UPDATED CODE

In your case, you could use this example and adapt it to something like this:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <cstdlib>   // rand, srand
#include <ctime>     // time
#include <iostream>  // std::cout

struct ElementWiseProductBasic : public thrust::binary_function<float2,float2,float2>
{
    __host__ __device__
    float2 operator()(const float2& v1, const float2& v2) const
    {
        float2 res;
        res.x = v1.x * v2.x - v1.y * v2.y;
        res.y = v1.x * v2.y + v1.y * v2.x;
        return res;
    }
};

/**
 * See: http://www.embedded.com/design/embedded/4007256/Digital-Signal-Processing-Tricks--Fast-multiplication-of-complex-numbers
 */
struct ElementWiseProductModified : public thrust::binary_function<float2,float2,float2>
{
    __host__ __device__
    float2 operator()(const float2& v1, const float2& v2) const
    {
        float2 res;
        float a, b, c, d, k;
        a = v1.x;
        b = v1.y;
        c = v2.x;
        d = v2.y;
        k = a * (c + d);
        d =  d * (a + b);
        c =  c * (b - a);
        res.x = k - d;
        res.y = k + c;
        return res;
    }
};

int get_random_int(int min, int max)
{
    return min + (rand() % (int)(max - min + 1));
}

thrust::host_vector<float2> init_vector(const size_t N)
{
    thrust::host_vector<float2> temp(N);
    for(size_t i = 0; i < N; i++)
    {
        temp[i].x = get_random_int(0, 10);
        temp[i].y = get_random_int(0, 10);
    }
    return temp;
}

int main(void)
{
    const size_t N = 100000;
    const bool compute_basic_product    = true;
    const bool compute_modified_product = true;

    srand(time(NULL));

    thrust::host_vector<float2>   h_A = init_vector(N);
    thrust::host_vector<float2>   h_B = init_vector(N);
    thrust::device_vector<float2> d_A = h_A;
    thrust::device_vector<float2> d_B = h_B;

    thrust::host_vector<float2> h_result(N);
    thrust::host_vector<float2> h_result_modified(N);

    if (compute_basic_product)
    {
        thrust::device_vector<float2> d_result(N);

        thrust::transform(d_A.begin(), d_A.end(),
                          d_B.begin(), d_result.begin(),
                          ElementWiseProductBasic());
        h_result = d_result;
    }

    if (compute_modified_product)
    {
        thrust::device_vector<float2> d_result_modified(N);

        thrust::transform(d_A.begin(), d_A.end(),
                          d_B.begin(), d_result_modified.begin(),
                          ElementWiseProductModified());
        h_result_modified = d_result_modified;
    }

    std::cout << std::fixed;
    for (size_t i = 0; i < 4; i++)
    {
        float2 a = h_A[i];
        float2 b = h_B[i];

        std::cout << "(" << a.x << "," << a.y << ")";
        std::cout << " * ";
        std::cout << "(" << b.x << "," << b.y << ")";

        if (compute_basic_product)
        {
            float2 prod = h_result[i];
            std::cout << " = ";
            std::cout << "(" << prod.x << "," << prod.y << ")";
        }

        if (compute_modified_product)
        {
            float2 prod_modified = h_result_modified[i];
            std::cout << " = ";
            std::cout << "(" << prod_modified.x << "," << prod_modified.y << ")";
        }
        std::cout << std::endl;
    }   

    return 0;
}

This returns:

(6.000000,5.000000)  * (0.000000,1.000000)  = (-5.000000,6.000000)
(3.000000,2.000000)  * (0.000000,4.000000)  = (-8.000000,12.000000)
(2.000000,10.000000) * (10.000000,4.000000) = (-20.000000,108.000000)
(4.000000,8.000000)  * (10.000000,9.000000) = (-32.000000,116.000000)

You can then compare the timings of the two multiplication strategies and pick whichever works best on your hardware.
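
For the timing itself, a minimal sketch using CUDA events, reusing d_A, d_B and the functors from the listing above (add a warm-up call first for a representative number):

// Sketch: time one thrust::transform call with CUDA events.
// Assumes d_A, d_B and ElementWiseProductBasic from the listing above.
thrust::device_vector<float2> d_result(N);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
thrust::transform(d_A.begin(), d_A.end(),
                  d_B.begin(), d_result.begin(),
                  ElementWiseProductBasic());
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "basic product: " << ms << " ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);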
