CUDA /推力中分段数据的向量化上界 [英] vectorized upper bound for segmented data in CUDA / thrust

查看：201 发布时间：2016/10/30 4:22:16 c++ cuda thrust

本文介绍了CUDA /推力中分段数据的向量化上界的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我有以下输入数据：

  e = 0 0 0 0 0 0 | 1 1 1 
 t = 1 1 4 4 4 5 | 1 6 7 
 i = 0 1 2 3 4 5 | 6 7 8 //来自[0，n-1]的索引

数据首先按 e ，然后通过 t 。 e 是标识数据中的段的键。在这种情况下：

  segment_0 = [0,5] 
 segment_1 = [6,8]

每个区段都由 t 在这种情况下：

  sub_segment_0_0 = [0,1] // t == 1 
 sub_segment_0_1 = [2， 4] // t == 4 
 sub_segment_0_2 = [5,5] // t == 5 
 
 sub_segment_1_0 = [6,6] // t == 1 
 sub_segment_1_1 = [7,7] // t == 6 
 sub_segment_1_2 = [8,8] // t == 7

我想创建以下输出序列：

  f = 2 2 5 5 5 6 | 7 8 9 
 l = 6 6 6 6 6 6 | 9 9 9

f

l 包含（当前段中最后一个sub_segment的结束索引）段）+ 1。

对于每个段的最后一个sub_segment，两个值都应指向其结束索引。

为了计算 f ，我尝试使用 thrust :: upper_bound ，但这只有当我只有一个sub_segment：

  #include< thrust / host_vector.h> 
 #include< thrust / copy.h> 
 #include< thrust / binary_search.h> 
 #include< thrust / device_vector.h> 
 #include< stdint.h> 
 #include< iostream> 
 
 #define PRINTER（name）print（#name，（name））
 template< template< typename ...>类V，类型名T，类型名... Args> 
 void print（const char * name，const V< T，Args ...>& v）
 {
 std :: cout<名称<< ：\t; 
 thrust :: copy（v.begin（），v.end（），std :: ostream_iterator< T>（std :: cout，\t））; 
 std :: cout<< std :: endl; 
} 
 
 int main（）
 {
 uint32_t e [] = {0,0,0,0,0,0}; 
 uint32_t t [] = {1,1,4,4,4,5}; 
 uint32_t i [] = {0,1,2,3,4,5}; 
 
 int size = sizeof（i）/ sizeof（i [0]）; 
 typedef thrust :: host_vector< uint32_t> HVec; 
 typedef thrust :: device_vector< uint32_t> DVec; 
 HVec h_i（i，i + size）; 
 HVec h_e（e，e + size）; 
 HVec h_t（t，t + size）; 
 DVec d_i = h_i; 
 DVec d_e = h_e; 
 DVec d_t = h_t; 
 PRINTER（d_e）; 
 PRINTER（d_t）; 
 PRINTER（d_i）; 
 
 DVec upper（size）; 
 thrust :: upper_bound（d_t.begin（），d_t.end（），d_t.begin（），d_t.end（），upper.begin（））; 
 PRINTER（upper）; 
 
 return 0; 
}

输出：

  d_e：0 0 0 0 0 0 
 d_t：1 1 4 4 4 5 
 d_i：0 1 2 3 4 5 
 upper：2 2 5 5 5 6

如果我使用包含两个子段的输入数据，它将不再工作，因为没有 thrust :: upper_bound_by_key ：

  //在上面的代码中替换
 uint32_t e [] = {0,0,0,0,0,0,1,1,1}; 
 uint32_t t [] = {1,1,4,4,4,5,1,6,7}; 
 uint32_t i [] = {0,1,2,3,4,5,6,7,8};

输出

  d_e：0 0 0 0 0 0 1 1 1 
 d_t：1 1 4 4 4 5 1 6 7 
 d_i：0 1 2 3 4 5 6 7 8 
 upper：2 2 7 7 7 7 2 8 9

如何对我的数据实现 upper_bound_by_key ？

计算 l ？

 
 
 我对任何解决方案都开放，不是必须的。
解决方案
我发现了另一种方法。 
 
 
 为了能够使用 lower_bound ，我需要确保 t 是全球排序。为了做到这一点，我首先使用 adjacent_difference 找到每个
 sub_segment的起点。之后， scatter_if 从 counting_iterator 中为子段的每个起点复制增加的数字。最后， inclusive_scan 为每个子段传播相同的值。我将 inclusive_scan 之前的两个步骤合并到自定义函子 my_scatter 中，以实现更好的内核融合。
 
 
 现在 upper_bound 应用于这些全局递增的值，以计算 f 。 / p>

l 可以通过 upper_bound c $ c> e 。

我不知道我的方法的效率如何与@RobertCrovella提出的方法相比。

输出：

  d_e：0 0 0 0 0 0 1 1 1 
 d_t：1 1 4 4 4 5 1 6 7 
 d_i：0 1 2 3 4 5 6 7 8 
 norm_t：0 0 2 2 2 7 13 20 28 
 d_f：2 2 5 5 5 6 7 8 9 
 d_l：6 6 6 6 6 6 9 9 9

  #include< thrust / host_vector.h> ; 
 #include< thrust / copy.h> 
 #include< thrust / binary_search.h> 
 #include< thrust / device_vector.h> 
 #include< thrust / iterator / counting_iterator.h> 
 #include< thrust / iterator / transform_iterator.h> 
 #include< thrust / adjacent_difference.h> 
 #include< thrust / functional.h> 
 #include< stdint.h> 
 #include< iostream> 
 #include< thrust / scatter.h> 
 #include< thrust / scan.h> 
 #include< thrust / transform.h> 
 
 #define PRINTER（name）print（#name，（name））
 template< template< typename ...>类V，类型名T，类型名... Args> 
 void print（const char * name，const V< T，Args ...>& v）
 {
 std :: cout<名称<< ：\t; 
 thrust :: copy（v.begin（），v.end（），std :: ostream_iterator< T>（std :: cout，\t））; 
 std :: cout<< std :: endl; 
} 
 
 template< typename IteratorType，typename IndexType = uint32_t> 
 struct my_scatter：public thrust :: unary_function< IndexType，IndexType> 
 {
 my_scatter（IteratorType first）：first（first）
 {
} 
 
 __host__ __device__ 
 IndexType运算符; i）
 {
 IndexType result = i; 
 if（i> static_cast< IndexType>（0）&& *（first + i）== *（first + i-static_cast< IndexType>（1）））
 {
 result = static_cast< IndexType>（0）; 
} 
 return result; 
} 
 
 IteratorType first; 
}; 
 
 template< typename IteratorType> 
 my_scatter< IteratorType> make_my_scatter（IteratorType first）
 {
 return my_scatter< IteratorType>（first）; 
} 
 
 int main（）
 {
 uint32_t e [] = {0,0,0,0,0,0,1,1,1} ; 
 uint32_t t [] = {1,1,4,4,4,5,1,6,7}; 
 uint32_t i [] = {0,1,2,3,4,5,6,7,8}; 
 
 int size = sizeof（i）/ sizeof（i [0]）; 
 typedef thrust :: host_vector< uint32_t> HVec; 
 typedef thrust :: device_vector< uint32_t> DVec; 
 HVec h_i（i，i + size）; 
 HVec h_e（e，e + size）; 
 HVec h_t（t，t + size）; 
 DVec d_i = h_i; 
 DVec d_e = h_e; 
 DVec d_t = h_t; 
 PRINTER（d_e）; 
 PRINTER（d_t）; 
 PRINTER（d_i）; 
 
 DVec norm_t（size）; 
 
 auto my_scatter_op = make_my_scatter（zip（d_e.begin（），d_t.begin（）））; 
 auto ti_begin = thrust :: make_transform_iterator（thrust :: make_counting_iterator（0），my_scatter_op）; 
 auto ti_end = thrust :: make_transform_iterator（thrust :: make_counting_iterator（size），my_scatter_op）; 
 thrust :: inclusive_scan（ti_begin，ti_end，norm_t.begin（））; 
 PRINTER（norm_t）; 
 
 DVec d_f（size）; 
 thrust :: upper_bound（norm_t.begin（），norm_t.end（），norm_t.begin（），norm_t.end（），d_f.begin（））; 
 PRINTER（d_f）; 
 
 DVec d_l（size）; 
 thrust :: upper_bound（d_e.begin（），d_e.end（），d_e.begin（），d_e.end（），d_l.begin（））; 
 PRINTER（d_l）; 
 
 return 0; 
}

I have the following input data:

e = 0 0 0 0 0 0 | 1 1 1
t = 1 1 4 4 4 5 | 1 6 7
i = 0 1 2 3 4 5 | 6 7 8 // indices from [0,n-1]

The data is first sorted by e, then by t. e is the key which identifies segments in the data. In this case:

segment_0 = [0,5]
segment_1 = [6,8]

Each segment is again segmented by t. In this case:

sub_segment_0_0 = [0,1] // t==1
sub_segment_0_1 = [2,4] // t==4
sub_segment_0_2 = [5,5] // t==5

sub_segment_1_0 = [6,6] // t==1
sub_segment_1_1 = [7,7] // t==6
sub_segment_1_2 = [8,8] // t==7

I want to create the following output sequences:

f = 2 2 5 5 5 6 | 7 8 9
l = 6 6 6 6 6 6 | 9 9 9

f contains the start index of the next sub_segment within the current segment.

l contains (the end index of the last sub_segment within the current segment) + 1.

For the last sub_segment of each segment both values should point to its end index.

In order to calculate f, I tried using thrust::upper_bound, but this only works if I have just one sub_segment:

#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>  
#include <stdint.h>
#include <iostream>

#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0};
    uint32_t t[] = {1,1,4,4,4,5};
    uint32_t i[] = {0,1,2,3,4,5};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec upper(size);
    thrust::upper_bound(d_t.begin(), d_t.end(), d_t.begin(), d_t.end(), upper.begin());
    PRINTER(upper);

    return 0;
}

output:

d_e:    0   0   0   0   0   0   
d_t:    1   1   4   4   4   5   
d_i:    0   1   2   3   4   5   
upper:  2   2   5   5   5   6

If I use the input data containing two sub_segments, it won't work anymore, since there is no thrust::upper_bound_by_key:

// replace in the code above
uint32_t e[] = {0,0,0,0,0,0,1,1,1};
uint32_t t[] = {1,1,4,4,4,5,1,6,7};
uint32_t i[] = {0,1,2,3,4,5,6,7,8};

output

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
upper:  2   2   7   7   7   7   2   8   9

How would a upper_bound_by_key be implemented for my data?

And how can I efficiently calculate l?

I am open to any solution, thrust is not a necessity.

解决方案

I found another way to do this.

In order to be able to use lower_bound, I needed to make sure that t is globally sorted. In order to do that, I first find out the starting points of each sub_segment using adjacent_difference. After that, scatter_if copies increasing numbers from a counting_iterator for each starting point of a subsegment. Finally, inclusive_scan spreads same values for each subsegment. I combined the two steps before the inclusive_scan into the custom functor my_scatter to achieve better kernel fusing.

Now upper_bound is applied to these globally increasing values to calculate f.

l can be calculated by applying upper_bound on e.

I am not sure how the efficiency of my approach compares to the approach presented by @RobertCrovella.

output:

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
norm_t: 0   0   2   2   2   7   13  20  28  
d_f:    2   2   5   5   5   6   7   8   9   
d_l:    6   6   6   6   6   6   9   9   9

#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>  
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/adjacent_difference.h>
#include <thrust/functional.h>
#include <stdint.h>
#include <iostream>
#include <thrust/scatter.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

template <typename IteratorType, typename IndexType = uint32_t>
struct my_scatter : public thrust::unary_function<IndexType,IndexType>
{
    my_scatter(IteratorType first) : first(first)
    {
    }

   __host__ __device__
   IndexType operator()(const IndexType& i)
   {
      IndexType result = i;
      if (i > static_cast<IndexType>(0) && *(first+i) == *(first+i-static_cast<IndexType>(1)))
      { 
          result = static_cast<IndexType>(0);
      }
      return result;
   }

   IteratorType first;
};

template <typename IteratorType>
my_scatter<IteratorType> make_my_scatter(IteratorType first)
{
  return my_scatter<IteratorType>(first);
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0,1,1,1};
    uint32_t t[] = {1,1,4,4,4,5,1,6,7};
    uint32_t i[] = {0,1,2,3,4,5,6,7,8};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;    
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec norm_t(size);

    auto my_scatter_op =  make_my_scatter(zip(d_e.begin(), d_t.begin()));
    auto ti_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), my_scatter_op);
    auto ti_end = thrust::make_transform_iterator(thrust::make_counting_iterator(size), my_scatter_op);
    thrust::inclusive_scan(ti_begin, ti_end, norm_t.begin());
    PRINTER(norm_t);

    DVec d_f(size);
    thrust::upper_bound(norm_t.begin(), norm_t.end(), norm_t.begin(), norm_t.end(), d_f.begin());    
    PRINTER(d_f);

    DVec d_l(size);
    thrust::upper_bound(d_e.begin(), d_e.end(), d_e.begin(), d_e.end(), d_l.begin());    
    PRINTER(d_l);

    return 0;
}

这篇关于CUDA /推力中分段数据的向量化上界的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

CUDA /推力中分段数据的向量化上界 [英] vectorized upper bound for segmented data in CUDA / thrust

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

CUDA /推力中分段数据的向量化上界 [英] vectorized upper bound for segmented data in CUDA / thrust

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭