Strided reduction by CUDA Thrust


Problem description



I have an array of vertices with this kind of structure:

[x0, y0, z0, empty float, x1, y1, z1, empty float, x2, y2, z2, empty float, ...]

I need to find minX, minY, minZ, maxX, maxY and maxZ using CUDA. I wrote a proper reduction algorithm, but it turns out to be a little too slow, so I decided to use the Thrust library. There is a highly optimized reduce(), or even better minmax_element(), which finds the min and max of an array simultaneously, but I can't find a fast way to use them on only every 4th index. Copying the data into 3 separate arrays is not the solution I'm looking for.

Is there a way (some kind of trick with Thrust iterators or something similar) to pass a stride to reduce()?

Solution

You can use a strided range method, along with 3 calls to thrust::minmax_element, to get your desired result without modifying data storage.
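In isolation, the core of the trick is an iterator that walks every 4th element of the underlying storage. The fragment below is a minimal sketch of that idea; the names stride4_functor and minmax_x are illustrative and not part of the original answer, and it assumes the same headers, d_vals layout, and stride of 4 as the full listing further down.

// uses the same #includes as the full listing below
struct stride4_functor : public thrust::unary_function<int,int>
{
  __host__ __device__
  int operator()(const int &i) const { return 4*i; }  // element i of the view -> raw index 4*i
};

// min and max of the x components of an interleaved [x, y, z, pad] device array
thrust::pair<float,float> minmax_x(thrust::device_vector<float> &d_vals, int num_vertices)
{
  typedef thrust::counting_iterator<int>                       CountIt;
  typedef thrust::transform_iterator<stride4_functor, CountIt> XformIt;
  typedef thrust::device_vector<float>::iterator               FloatIt;
  typedef thrust::permutation_iterator<FloatIt, XformIt>       StrideIt;
  // a view of d_vals[0], d_vals[4], d_vals[8], ... without copying anything
  StrideIt x_begin(d_vals.begin(), XformIt(CountIt(0), stride4_functor()));
  thrust::pair<StrideIt, StrideIt> mm = thrust::minmax_element(x_begin, x_begin + num_vertices);
  // shifting the base iterator (d_vals.begin()+1, +2) gives the same view of y and z
  return thrust::make_pair((float)*mm.first, (float)*mm.second);
}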

Here's a worked example:

$ cat t491.cu 
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <thrust/copy.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <thrust/transform_reduce.h>

#define DSIZE (1048576*2)
#define SSIZE 4
#define FSIZE (DSIZE*SSIZE)
#define nTPB 256
#define BSIZE nTPB
#define nBLKS 64
#define FLOAT_MIN (-99999)
#define FLOAT_MAX 99999


typedef thrust::tuple<float, float, float, float, float, float> tpl6;

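// expand_functor: converts one float4 element into a 6-tuple
// (xmin, xmax, ymin, ymax, zmin, zmax) seeded with that element's own x, y, z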
struct expand_functor
{
  __host__ __device__
  tpl6 operator()(const float4 a){
    tpl6 result;
    result.get<0>() = a.x;
    result.get<1>() = a.x;
    result.get<2>() = a.y;
    result.get<3>() = a.y;
    result.get<4>() = a.z;
    result.get<5>() = a.z;
    return result;
  }
};


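// minmax3_functor: reduction operator that merges two 6-tuples, keeping the
// smaller value in each min slot (0, 2, 4) and the larger in each max slot (1, 3, 5)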
struct minmax3_functor
{
  __host__ __device__
  tpl6 operator()(const tpl6 a, const tpl6 b) {
    tpl6 result;
    result.get<0>() = (a.get<0>() < b.get<0>()) ? a.get<0>():b.get<0>();
    result.get<1>() = (a.get<1>() > b.get<1>()) ? a.get<1>():b.get<1>();
    result.get<2>() = (a.get<2>() < b.get<2>()) ? a.get<2>():b.get<2>();
    result.get<3>() = (a.get<3>() > b.get<3>()) ? a.get<3>():b.get<3>();
    result.get<4>() = (a.get<4>() < b.get<4>()) ? a.get<4>():b.get<4>();
    result.get<5>() = (a.get<5>() > b.get<5>()) ? a.get<5>():b.get<5>();
    return result;
  }
};

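// per-block partial results plus a completion counter, used by my_minmax3
// so that the last block to finish can perform the final reduction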
__device__ int bcount = 0;
__device__ float xmins[nBLKS];
__device__ float xmaxs[nBLKS];
__device__ float ymins[nBLKS];
__device__ float ymaxs[nBLKS];
__device__ float zmins[nBLKS];
__device__ float zmaxs[nBLKS];

__global__ void my_minmax3(float4 *data, float *results, size_t dsize){
  // assumes power-of-2 threadblock size
  // assumes nBLKS <= nTPB, nBLKS also power-of-2
  __shared__ float xmin[BSIZE], xmax[BSIZE], ymin[BSIZE], ymax[BSIZE], zmin[BSIZE], zmax[BSIZE];
  __shared__ int lblock;

  float my_xmin = FLOAT_MAX;
  float my_ymin = FLOAT_MAX;
  float my_zmin = FLOAT_MAX;
  float my_xmax = FLOAT_MIN;
  float my_ymax = FLOAT_MIN;
  float my_zmax = FLOAT_MIN;
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  while (idx < dsize){
    float4 my_temp = data[idx];
    if (my_xmin > my_temp.x) my_xmin = my_temp.x;
    if (my_ymin > my_temp.y) my_ymin = my_temp.y;
    if (my_zmin > my_temp.z) my_zmin = my_temp.z;
    if (my_xmax < my_temp.x) my_xmax = my_temp.x;
    if (my_ymax < my_temp.y) my_ymax = my_temp.y;
    if (my_zmax < my_temp.z) my_zmax = my_temp.z;
    idx += blockDim.x*gridDim.x;}
  xmin[threadIdx.x] = my_xmin;
  ymin[threadIdx.x] = my_ymin;
  zmin[threadIdx.x] = my_zmin;
  xmax[threadIdx.x] = my_xmax;
  ymax[threadIdx.x] = my_ymax;
  zmax[threadIdx.x] = my_zmax;
  __syncthreads();
  for (int i = blockDim.x/2; i > 0; i>>=1){
    if (threadIdx.x < i){
      if (xmin[threadIdx.x] > xmin[threadIdx.x+i]) xmin[threadIdx.x] = xmin[threadIdx.x + i];
      if (ymin[threadIdx.x] > ymin[threadIdx.x+i]) ymin[threadIdx.x] = ymin[threadIdx.x + i];
      if (zmin[threadIdx.x] > zmin[threadIdx.x+i]) zmin[threadIdx.x] = zmin[threadIdx.x + i];
      if (xmax[threadIdx.x] < xmax[threadIdx.x+i]) xmax[threadIdx.x] = xmax[threadIdx.x + i];
      if (ymax[threadIdx.x] < ymax[threadIdx.x+i]) ymax[threadIdx.x] = ymax[threadIdx.x + i];
      if (zmax[threadIdx.x] < zmax[threadIdx.x+i]) zmax[threadIdx.x] = zmax[threadIdx.x + i];
      }
    __syncthreads();
    }
  if (!threadIdx.x){
    xmins[blockIdx.x] = xmin[0];
    xmaxs[blockIdx.x] = xmax[0];
    ymins[blockIdx.x] = ymin[0];
    ymaxs[blockIdx.x] = ymax[0];
    zmins[blockIdx.x] = zmin[0];
    zmaxs[blockIdx.x] = zmax[0];
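    // make this block's partial results visible device-wide before bumping the
    // counter; the block that increments bcount to nBLKS-1 knows it is the last
    // one to finish and performs the final reduction below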
    __threadfence();
    if (atomicAdd(&bcount, 1) == (nBLKS-1)) lblock = 1;
    else lblock = 0;
    }
  __syncthreads();
  if (lblock){ // last block does final reduction
    if (threadIdx.x < nBLKS){
      xmin[threadIdx.x] = xmins[threadIdx.x];
      xmax[threadIdx.x] = xmaxs[threadIdx.x];
      ymin[threadIdx.x] = ymins[threadIdx.x];
      ymax[threadIdx.x] = ymaxs[threadIdx.x];
      zmin[threadIdx.x] = zmins[threadIdx.x];
      zmax[threadIdx.x] = zmaxs[threadIdx.x];}
    __syncthreads();
    for (int i = nBLKS/2; i > 0; i>>=1){
      if (threadIdx.x < i){
        if (xmin[threadIdx.x] > xmin[threadIdx.x+i]) xmin[threadIdx.x] = xmin[threadIdx.x + i];
        if (ymin[threadIdx.x] > ymin[threadIdx.x+i]) ymin[threadIdx.x] = ymin[threadIdx.x + i];
        if (zmin[threadIdx.x] > zmin[threadIdx.x+i]) zmin[threadIdx.x] = zmin[threadIdx.x + i];
        if (xmax[threadIdx.x] < xmax[threadIdx.x+i]) xmax[threadIdx.x] = xmax[threadIdx.x + i];
        if (ymax[threadIdx.x] < ymax[threadIdx.x+i]) ymax[threadIdx.x] = ymax[threadIdx.x + i];
        if (zmax[threadIdx.x] < zmax[threadIdx.x+i]) zmax[threadIdx.x] = zmax[threadIdx.x + i];
        }
      __syncthreads();
      }
    if (!threadIdx.x){
      results[0] = xmin[0];
      results[1] = xmax[0];
      results[2] = ymin[0];
      results[3] = ymax[0];
      results[4] = zmin[0];
      results[5] = zmax[0];
      }

   }
}

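// strided_range: presents every stride-th element of [first, last) as a normal
// iterator range, built from a permutation_iterator whose indices come from a
// transform_iterator over a counting_iterator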
template <typename Iterator>
class strided_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

typedef thrust::device_vector<float>::iterator Iter;
typedef strided_range<Iter>::iterator sIter;

int main(){
// set up test data
  cudaEvent_t start, stop;
  float et;
  cudaEventCreate(&start); cudaEventCreate(&stop);
  thrust::host_vector<float> h_vals(FSIZE);
  for (int i = 0; i < DSIZE; i ++) {
    h_vals[i*SSIZE + 0] = rand()/(float)RAND_MAX;
    h_vals[i*SSIZE + 1] = rand()/(float)RAND_MAX;
    h_vals[i*SSIZE + 2] = rand()/(float)RAND_MAX;
    h_vals[i*SSIZE + 3] = 0.0f;}
  thrust::device_vector<float> d_vals = h_vals;
// set up strided ranges
  strided_range<Iter> item_x(d_vals.begin()  , d_vals.end(), SSIZE);
  strided_range<Iter> item_y(d_vals.begin()+1, d_vals.end(), SSIZE);
  strided_range<Iter> item_z(d_vals.begin()+2, d_vals.end(), SSIZE);
// find min and max
  cudaEventRecord(start);
  thrust::pair<sIter, sIter> result_x = thrust::minmax_element(item_x.begin(), item_x.end());
  thrust::pair<sIter, sIter> result_y = thrust::minmax_element(item_y.begin(), item_y.end());
  thrust::pair<sIter, sIter> result_z = thrust::minmax_element(item_z.begin(), item_z.end());
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&et, start, stop);
  std::cout << "thrust elapsed time: " << et << "ms" << std::endl;
  std::cout << "thrust results: " << std::endl;
  std::cout << "x min element = " << *(result_x.first)  << std::endl;
  std::cout << "x max element = " << *(result_x.second) << std::endl;
  std::cout << "y min element = " << *(result_y.first)  << std::endl;
  std::cout << "y max element = " << *(result_y.second) << std::endl;
  std::cout << "z min element = " << *(result_z.first)  << std::endl;
  std::cout << "z max element = " << *(result_z.second) << std::endl;

  float *h_results, *d_results;
  h_results = new float[6];
  cudaMalloc(&d_results, 6*sizeof(float));
  cudaEventRecord(start);
  my_minmax3<<<nBLKS,nTPB>>>((float4 *)thrust::raw_pointer_cast(d_vals.data()), d_results, DSIZE);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&et, start, stop);
  cudaMemcpy(h_results, d_results, 6*sizeof(float), cudaMemcpyDeviceToHost);
  std::cout << "kernel elapsed time: " << et << "ms" << std::endl;
  std::cout << "kernel results: " << std::endl;
  std::cout << "x min element = " << h_results[0]  << std::endl;
  std::cout << "x max element = " << h_results[1]  << std::endl;
  std::cout << "y min element = " << h_results[2]  << std::endl;
  std::cout << "y max element = " << h_results[3]  << std::endl;
  std::cout << "z min element = " << h_results[4]  << std::endl;
  std::cout << "z max element = " << h_results[5]  << std::endl;

  thrust::device_ptr<float4> dptr_vals = thrust::device_pointer_cast(reinterpret_cast<float4 *>( thrust::raw_pointer_cast(d_vals.data())));
  tpl6 my_init;
  my_init.get<0>() = FLOAT_MAX;
  my_init.get<1>() = FLOAT_MIN;
  my_init.get<2>() = FLOAT_MAX;
  my_init.get<3>() = FLOAT_MIN;
  my_init.get<4>() = FLOAT_MAX;
  my_init.get<5>() = FLOAT_MIN;
  cudaEventRecord(start);
  tpl6 my_result = thrust::transform_reduce(dptr_vals, dptr_vals + DSIZE, expand_functor(),  my_init, minmax3_functor());
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&et, start, stop);
  cudaMemcpy(h_results, d_results, 6*sizeof(float), cudaMemcpyDeviceToHost);
  std::cout << "thrust2 elapsed time: " << et << "ms" << std::endl;
  std::cout << "thrust2 results: " << std::endl;
  std::cout << "x min element = " << my_result.get<0>()  << std::endl;
  std::cout << "x max element = " << my_result.get<1>()  << std::endl;
  std::cout << "y min element = " << my_result.get<2>()  << std::endl;
  std::cout << "y max element = " << my_result.get<3>()  << std::endl;
  std::cout << "z min element = " << my_result.get<4>()  << std::endl;
  std::cout << "z max element = " << my_result.get<5>()  << std::endl;
  return 0;
}
$ nvcc -O3 -arch=sm_20 -o t491 t491.cu
$ ./t491
thrust elapsed time: 3.88784ms
thrust results:
x min element = 1.16788e-06
x max element = 0.999998
y min element = 2.85916e-07
y max element = 1
z min element = 1.72295e-08
z max element = 0.999999
kernel elapsed time: 0.462848ms
kernel results:
x min element = 1.16788e-06
x max element = 0.999998
y min element = 2.85916e-07
y max element = 1
z min element = 1.72295e-08
z max element = 0.999999
thrust2 elapsed time: 1.29728ms
thrust2 results:
x min element = 1.16788e-06
x max element = 0.999998
y min element = 2.85916e-07
y max element = 1
z min element = 1.72295e-08
z max element = 0.999999
$

I've updated the above example to include for comparison an "optimized" reduction kernel that does all 6 reductions (min and max operations) in a single kernel call.

As expected, this approach runs faster than 3 individual Thrust calls to produce the same result, about 5-8 times faster in my case (RHEL5.5, Quadro5000, CUDA 6.5RC), depending on data size. Note that although I have chosen a data size (DSIZE) here that is a power of 2, the entire example works correctly for arbitrary data sizes. I have dispensed with proper CUDA error checking for the sake of brevity of presentation.
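For reference, a minimal error-checking wrapper could be added around the runtime calls and the kernel launch; the cudaCheck macro below is illustrative and not part of the original code.

#include <cstdio>
#include <cstdlib>

// abort with a readable message if a CUDA runtime call reports an error
#define cudaCheck(call)                                            \
  do {                                                             \
    cudaError_t err__ = (call);                                    \
    if (err__ != cudaSuccess) {                                    \
      fprintf(stderr, "CUDA error: %s at %s:%d\n",                 \
              cudaGetErrorString(err__), __FILE__, __LINE__);      \
      exit(EXIT_FAILURE);                                          \
    }                                                              \
  } while (0)

// typical usage:
//   cudaCheck(cudaMalloc(&d_results, 6*sizeof(float)));
//   my_minmax3<<<nBLKS,nTPB>>>((float4 *)thrust::raw_pointer_cast(d_vals.data()), d_results, DSIZE);
//   cudaCheck(cudaGetLastError());        // catches launch configuration errors
//   cudaCheck(cudaDeviceSynchronize());   // catches errors during kernel execution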

EDIT: Following the suggestion by @JaredHoberock, I have added a 3rd approach that allows a single call to thrust::transform_reduce to produce all 6 results. These are the "thrust2" results above. This method is about 3x faster than the first (three-Thrust-call) method. Still not as fast as the CUDA kernel method, but perhaps this Thrust approach can be optimized further.
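Since all three approaches should agree, a simple host-side reference check can be used to validate them. The helper below is illustrative and not part of the original answer; it walks the same interleaved h_vals layout on the CPU.

#include <algorithm>

// sequential min/max over the interleaved [x, y, z, pad] host data;
// out = {xmin, xmax, ymin, ymax, zmin, zmax}, the same order the kernel uses
void reference_minmax(const float *vals, int n_vertices, float out[6])
{
  out[0] = out[2] = out[4] =  99999.0f;   // mins start at FLOAT_MAX
  out[1] = out[3] = out[5] = -99999.0f;   // maxs start at FLOAT_MIN
  for (int i = 0; i < n_vertices; ++i) {
    const float *v = vals + 4*i;
    out[0] = std::min(out[0], v[0]);  out[1] = std::max(out[1], v[0]);
    out[2] = std::min(out[2], v[1]);  out[3] = std::max(out[3], v[1]);
    out[4] = std::min(out[4], v[2]);  out[5] = std::max(out[5], v[2]);
  }
}

// e.g. in main():  float ref[6];  reference_minmax(&h_vals[0], DSIZE, ref);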
