软阈值在CUDA [英] Soft thresholding in CUDA

查看:160
本文介绍了软阈值在CUDA的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有一个包含 1024 个元素的数组,以及一个阈值 t > 0,需要执行以下操作

I have an array with 1024 elements and I have a threshold t>0 to do the following operations

A[i] = A[i] - t  // if  A[i] > t
A[i] = A[i] + t  // if A[i] < -t
A[i] = 0         // otherwise

简单内核:

// Soft-thresholding (shrinkage) kernel, grid-stride form:
//   A[i] = A[i] - t  if A[i] >  t
//   A[i] = A[i] + t  if A[i] < -t
//   A[i] = 0         otherwise
// Works for any 1D launch configuration; assumes t >= 0.
// Fixed two compile errors in the original: `blockIdx` (the whole uint3)
// was used where `blockIdx.x` is required, and the stride advance
// incremented an undeclared `id` instead of `i`.
void __global__ Kernel_Shrinkage(float* A, float t, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while (i < n)
    {
        float temp = A[i];
        if (fabsf(temp) > t)   // fabsf: single-precision overload, no double promotion
        {
            if (temp < 0)
            {
                A[i] += t;
            }
            else
                A[i] -= t;
        }
        else
            A[i] = 0;
        i += gridDim.x * blockDim.x;   // grid-stride: advance by total thread count
    }
}

它与 CPU 实现相比有 6 倍加速。因为这个内核有许多控制语句,我怀疑有办法进一步优化这个内核。有什么帮助吗?

It shows 6x speed up as compared with a CPU implementation. Because this kernel has many control statements, I suspect there is a way to optimize this kernel. Any help?

另外,如果我有一个前提条件:数组是稀疏的,也就是说它的大多数元素为 0,又该如何处理?

And if I have the prerequisite that the array is sparse which means that the most elements in it are 0?

推荐答案

您的算法是软阈值问题的实施,因为它是计算:

Your algorithm is an implementation of the soft thresholding problem since it is computing:

A[i] = sign(A[i]) * max(|A[i]| - t, 0)

这个问题已经面临:软阈值CUDA实现

This problem has been already faced in: Soft Thresholding CUDA implementation, where two solutions have been devised.

通过下面的代码,我将比较两个提到的解决方案和由Robert Crovella提供的解决方案。

By the code below, I'm comparing the two mentioned solution and the solution provided by Robert Crovella.

CODE (为了简洁,CUDA错误检查被忽略,但始终添加)

CODE (CUDA error check neglected, for the sake of brevity, but ALWAYS add it)

#include <cstdio>       /* printf */
#include <cstdlib>      /* rand, srand, RAND_MAX */
#include <time.h>       /* time */

#include <thrust\device_vector.h>

#define BLOCKSIZE 256

/*******************/
/* iDivUp FUNCTION */
/*******************/
// Ceiling of a / b for non-negative a and positive b (grid-size helper).
int iDivUp(int a, int b)
{
    int q = a / b;
    if (a % b != 0)
        ++q;
    return q;
}

/*****************/
/* TEST KERNEL 1 */
/*****************/
// Soft thresholding via rescaling: each surviving sample is multiplied
// by (|x| - lambda) / |x|, which shrinks its magnitude by lambda while
// keeping its sign; samples with |x| <= lambda are zeroed.
__global__ void myKernel1(float* __restrict__ x, float lambda, const int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;                 // guard: grid may overshoot N

    const float val = x[idx];
    const float mag = fabs(val);
    x[idx] = (mag > lambda) ? val * ((mag - lambda) / mag) : 0;
}

/*****************/
/* TEST KERNEL 2 */
/*****************/
// Branchless soft thresholding: signbit(lambda - |x|) is 1 exactly when
// |x| > lambda, and copysign transfers the original sign onto the shrunk
// magnitude |x| - lambda, so the product is the thresholded value.
__global__ void myKernel2(float* __restrict__ x, float lambda, const int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N) return;                 // guard: grid may overshoot N

    const float val = x[idx];
    const float mag = fabs(val);
    x[idx] = signbit(lambda - mag) * copysign(mag - lambda, val);
}

/*****************/
/* TEST KERNEL 3 */
/*****************/
// Straightforward three-way branch (Robert Crovella's variant): shrink
// toward zero by lambda, clamping everything in [-lambda, lambda] to 0.
__global__ void myKernel3(float* __restrict__ x, float lambda, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        float v = x[idx];
        if (v > lambda)        v -= lambda;
        else if (-v > lambda)  v += lambda;
        else                   v = 0;
        x[idx] = v;
    }
}

/********/
/* MAIN */
/********/
// Benchmarks the three soft-thresholding kernels on the same random data,
// timing each with CUDA events. The device vector is re-seeded from the
// host copy before each kernel so all three see identical input.
// Fixed: `rand()/RAND_MAX` is INTEGER division (0 for almost every call),
// so the original filled the array with -0.5f everywhere; cast to float
// before dividing. Also `void main()` is nonstandard C++ -> `int main()`.
int main() {

    const int N = 10000000;        // number of elements to threshold

    const float lambda = 0.3f;     // shrinkage threshold

    // Host data uniformly distributed in [-0.5, 0.5].
    thrust::host_vector<float> h_data(N);
    srand(time(NULL));
    for (int i = 0; i < N; i++)
        h_data[i] = (float)rand() / (float)RAND_MAX - 0.5f;

    thrust::device_vector<float> d_data(h_data);

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    myKernel1<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(thrust::raw_pointer_cast(d_data.data()), lambda, N);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time kernel 1:  %3.3f ms \n", time);

    d_data = h_data;               // restore pristine input for the next kernel

    cudaEventRecord(start, 0);

    myKernel2<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(thrust::raw_pointer_cast(d_data.data()), lambda, N);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time kernel 2:  %3.3f ms \n", time);

    d_data = h_data;               // restore pristine input for the next kernel

    cudaEventRecord(start, 0);

    myKernel3<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(thrust::raw_pointer_cast(d_data.data()), lambda, N);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time kernel 3:  %3.3f ms \n", time);

    return 0;
}

计算结果(使用 CUDA 6.0 测试)

COMPUTATIONAL RESULTS (Tested with CUDA 6.0)

GT540M

GT540M

Kernel 1    47.9ms
Kernel 2    41.5ms
Kernel 3    42.4ms

K20c

K20c

Kernel 1    0.676ms
Kernel 2    0.591ms
Kernel 3    0.592ms

这篇关于软阈值在CUDA的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆