CUDA Thrust - 计算匹配子阵列 [英] CUDA Thrust - Counting matching subarrays

查看：204 发布时间：2017/3/4 15:35:26 c++ arrays cuda thrust entropy

本文介绍了CUDA Thrust - 计算匹配子阵列的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我想弄清楚是否可以使用CUDA有效地计算一组数字的条件熵。您可以通过将数组划分为多个窗口，然后计算不同长度的匹配子数组/子串的数量来计算条件熵。对于每个子阵列长度，您通过将匹配的子阵列计数乘以这些计数的对数来计算熵。

为了给出一个更清楚的例子，我的意思是，这里是完整的计算：

初始数组是[1,2,3,5,1,2,5]。假设窗口大小为3，则必须分为五个窗口：[1,2,3]，[2,3,5]，[3,5,1]，[5,1,2]和[1 ，2,5]。

接下来，查看每个窗口，我们要为每个长度找到匹配的子阵列。

长度为1的子阵列是[1]，[2]，[3]，[5]，[1]。有两个1，每个其他号码之一。因此，熵为log（2） 2 + 4 （log（1）* 1）= 0.6。

长度2为[1,2]，[2,3]，[3,5]，[5,1]和[1,2]。有两个[1,2]，和四个独特的子阵列。熵与长度1，log（2）2 + 4（log（1）* 1）= 0.6相同。

长度为3的子阵列是全窗口：[1,2,3]，[2,3,5]，[3,5,1]，[5,1,2]和[1,2， 5]。所有五个窗口是唯一的，因此熵为5 *（log（1）* 1）= 0。

最小熵为0，该数组的条件熵。

这也可以表示为树，其中每个节点的计数表示有多少匹配存在。每个子阵列长度的熵等于树的每个级别的熵。

如果可能，喜欢同时对许多数组执行此计算，并且也并行执行计算本身。有没有人有如何完成这一建议？推力有用吗？

解决方案

我尝试使用推力解决您的问题。它工作，但它导致推力电话很多。
由于您的输入大小相当小，您应该并行处理多个数组。
但是，这样做会导致大量的记账工作，您将在以下代码中看到这一点。

您的输入范围限制为 [1,5] ，相当于 [0,4] 。一般的想法是（理论上）这个范围之外的任何元组（例如 {1,2,3} 可以表示为基数4中的数字（例如 1 + 2 * 4 + 3 * 16 = 57 ）
在实践中，我们受整数类型的大小限制，对于32位无符号整数，最大元组大小 16 这也是以下代码可以处理的最大窗口大小（更改为64位无符号整数将导致最大元组大小<$ c $

让我们假设输入数据的结构如下：
我们有 2 数组，每个数组的大小 5 ，窗口大小为 3 。

  {{0,0,3,4,4}，{0,2,1,1,3 }}

我们现在可以生成所有窗口：

  {{0,0,3}，{0,3,4}，{3,4,4}}，{{0,2,1}，{ 1,1}，{1,1,3}}

使用每个元组的前缀和并应用上述每个元组的表示作为单个4进制数，我们得到：

  {{0,0,48} {0,12,76}，{3,19,83}}，{{0,8,24}，{2,6,22}，{1,5,53}}

现在我们重新排序值，所以我们有数字代表一个特定长度的子数组旁边：

  {{0,0,3}，{0,12,19}，{48,76,83}}，{0,2,1 }，{8,6,5}，{24,22,53}}

  {{0,0,3}，{0,12,19}，{48,76， 83}}，{0,1,2}，{5,6,8}，{22,24,53}}

现在我们可以计算每个组中出现一个数字的频率：

  2,1,1， 1,1,1,1,1,1,1,1,1,1,1,1,1,1

在

中应用日志公式结果

  0.60206,0,0,0,0,0

现在我们获取每个数组的最小值：

0,0

  #include< thrust / device_vector.h> 
 #include< thrust / copy.h> 
 #include< thrust / transform.h> 
 #include< thrust / iterator / counting_iterator.h> 
 #include< thrust / iterator / zip_iterator.h> 
 #include< thrust / functional.h> 
 #include< thrust / random.h> 
 #include< iostream> 
 #include< thrust / tuple.h> 
 #include< thrust / reduce.h> 
 #include< thrust / scan.h> 
 #include< thrust / gather.h> 
 #include< thrust / sort.h> 
 #include< math.h> 
 
 #include< chrono> 
 
 #ifdef PRINT_ENABLED 
 #define PRINTER（name）print（#name，（name））
 #else 
 #define PRINTER（name）
 #endif 
 
 template< template< typename ...>类V，类型名T，类型名... Args> 
 void print（const char * name，const V< T，Args ...>& v）
 {
 std :: cout< name<< ：\t; 
 thrust :: copy（v.begin（），v.end（），std :: ostream_iterator< T>（std :: cout，\t））; 
 std :: cout<< std :: endl; 
} 
 
 
 template< typename Integer，Integer Min，Integer Max> 
 struct random_filler 
 {
 __device__ 
 Integer operator（）（std :: size_t index）const 
 {
 thrust :: default_random_engine rng; 
 thrust :: uniform_int_distribution< Integer> dist（Min，Max）; 
 rng.discard（index）; 
 return dist（rng）; 
} 
}; 
 
 template< std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize，
 typename T，
 std： ：size_t WindowCount = ArraySize-（WindowSize-1），
 std :: size_t PerArrayCount = WindowSize * WindowCount> 
 
 __device__ __inline__ 
 thrust :: tuple< T，T，T，T> calc_indices（const T& i0）
 {
 const T i1 = i0 / PerArrayCount; 
 const T i2 = i0％PerArrayCount; 
 const T i3 = i2 / WindowSize; 
 const T i4 = i2％WindowSize; 
 return thrust :: make_tuple（i1，i2，i3，i4）; 
} 
 
 template< typename Iterator，
 std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize，
 std :: size_t WindowCount = ArraySize  - （WindowSize-1），
 std :: size_t PerArrayCount = WindowSize * WindowCount，
 std :: size_t TotalCount = PerArrayCount * ArrayCount 
> 
 class sliding_window 
 {
 public：
 
 typedef typename thrust :: iterator_difference< Iterator> :: type difference_type; 
 
 struct window_functor：public thrust :: unary_function< difference_type，difference_type> 
 {
 __host__ __device__ 
 difference_type operator（）（const difference_type& i0）const 
 {
 auto t = calc_indices< ArraySize，ArrayCount，WindowSize>（i0） 
 
 return thrust :: get< 0>（t）* ArraySize + thrust :: get 2（t）+ thrust :: get 3 
} 
}; 
 
 typedef typename thrust :: counting_iterator< difference_type>计数迭代器; 
 typedef typename thrust :: transform_iterator< window_functor，CountingIterator> TransformIterator; 
 typedef typename thrust :: permutation_iterator< Iterator，TransformIterator> PermutationIterator; 
 
 typedef PermutationIterator iterator; 
 
 sliding_window（Iterator first）：first（first）{} 
 
迭代器begin（void）const 
 {
 return PermutationIterator（first，TransformIterator CountingIterator（0），window_functor（）））; 
} 
 
 iterator end（void）const 
 {
 return begin（）+ TotalCount; 
} 
 
 protected：
迭代器先; 
}; 
 
 template< std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize，
 typename Iterator> 
 sliding_window< Iterator，ArraySize，ArrayCount，WindowSize> 
 make_sliding_window（Iterator first）
 {
 return sliding_window< Iterator，ArraySize，ArrayCount，WindowSize>（first）; 
} 
 
 
模板< typename KeyType，
 std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize> 
 struct key_generator：thrust :: unary_function< KeyType，thrust :: tuple< KeyType，KeyType> > 
 {
 __device__ 
 thrust :: tuple< KeyType，KeyType> operator（）（std :: size_t i0）const 
 {
 
 auto t = calc_indices< ArraySize，ArrayCount，WindowSize>（i0）; 
 return thrust :: make_tuple（thrust :: get< 0>（t），thrust :: get 2（t））; 
} 
}; 
 
 
 template< typename Integer，
 std :: size_t Base，
 std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize> 
 struct base_n：thrust :: unary_function< thrust :: tuple< Integer，Integer>，Integer> 
 {
 __host__ __device__ 
 Integer operator（）（const thrust :: tuple< Integer，Integer> t）const 
 {
 const auto i = calc_indices& ArrayCount，WindowSize>（thrust :: get< 0（t））; 
 
 // ipow可以通过在编译时预先计算查找表来优化
 const auto result = thrust :: get< 1>（t）* ipow（Base，thrust :: get< 3（i））; 
 return result; 
} 
 
 //取自http://stackoverflow.com/a/101613/678093 
 __host__ __device__ __inline__ 
整数ipow（整数基数，整数exp） const 
 {
整数result = 1; 
 while（exp）
 {
 if（exp& 1）
 result * = base; 
 exp>> = 1; 
 base * = base; 
} 
 
返回结果; 
} 
}; 
 
 template< std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize，
 typename T，
 std： ：size_t WindowCount = ArraySize-（WindowSize-1），
 std :: size_t PerArrayCount = WindowSize * WindowCount> 
 
 __device__ __inline__ 
 thrust :: tuple< T，T，T，T> calc_sort_indices（const T& i0）
 {
 const T i1 = i0％PerArrayCount; 
 const T i2 = i0 / PerArrayCount; 
 const T i3 = i1％WindowCount; 
 const T i4 = i1 / WindowCount; 
 return thrust :: make_tuple（i1，i2，i3，i4）; 
} 
 
 template< typename Integer，
 std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize，
 std :: size_t WindowCount = ArraySize-（WindowSize-1），
 std :: size_t PerArrayCount = WindowSize * WindowCount> 
 struct pre_sort：thrust :: unary_function< Integer，Integer> 
 {
 __device__ 
 Integer operator（）（Integer i0）const 
 {
 
 auto t = calc_sort_indices< ArraySize，ArrayCount，WindowSize>（i0） ; 
 
 
 const Integer i_result =（thrust :: get 2（t）* WindowSize + thrust :: get 3（t））+ thrust :: get t）* PerArrayCount; 
 return i_result; 
} 
}; 
 
 
 template< typename Integer，
 std :: size_t ArraySize，
 std :: size_t ArrayCount，
 std :: size_t WindowSize，
 std :: size_t WindowCount = ArraySize-（WindowSize-1），
 std :: size_t PerArrayCount = WindowSize * WindowCount> 
 struct generate_sort_keys：thrust :: unary_function< Integer，Integer> 
 {
 __device__ 
 thrust :: tuple< Integer，Integer> operator（）（Integer i0）const 
 {
 
 auto t = calc_sort_indices< ArraySize，ArrayCount，WindowSize>（i0）; 
 
 return thrust :: make_tuple（thrust :: get 1（t），thrust :: get 3（t））; 
} 
}; 
 
 
 template< typename ... Iterators> 
 __host__ __device__ 
 thrust :: zip_iterator< thrust :: tuple< Iterators ...>> zip（Iterators ... its）
 {
 return thrust :: make_zip_iterator（thrust :: make_tuple（its ...））; 
} 
 
 
 struct calculate_log：thrust :: unary_function< std :: size_t，float> 
 {
 __host__ __device__ 
 float operator（）（std :: size_t i）const 
 {
 return i * log10f（i）; 
} 
}; 
 
 
 int main（）
 {
 typedef int Integer; 
 typedef float Real; 
 
 const std :: size_t array_count = ARRAY_COUNT; 
 const std :: size_t array_size = ARRAY_SIZE; 
 
 const std :: size_t window_size = WINDOW_SIZE; 
 
 const std :: size_t window_count = array_size  - （window_size-1）; 
 
 const std :: size_t input_size = array_count * array_size; 
 
 const std :: size_t base = 4; 
 
 thrust :: device_vector< Integer> input_arrays（input_size）; 
 
 thrust :: counting_iterator< Integer> counting_it（0）; 
 
 thrust :: transform（counting_it，
 counting_it + input_size，
 input_arrays.begin（），
 random_filler< Integer，0，base>（））; 
 
 PRINTER（input_arrays）; 
 
 const int runs = 100; 
 
 auto start = std :: chrono :: high_resolution_clock :: now（）; 
 for（int k = 0; k  {
 auto sw = make_sliding_window< array_size，array_count，window_size>（input_arrays.begin（））; 
 
 const std :: size_t total_count = window_size * window_count * array_count; 
 
 thrust :: device_vector< Integer> result（total_count）; 
 
 thrust :: copy（sw.begin（），sw.end（），result.begin（））; 
 
 PRINTER（result）; 
 
 
 auto ti_begin = thrust :: make_transform_iterator（counting_it，key_generator< Integer，array_size，array_count，window_size>（））; 
 
 auto base_4_ti = thrust :: make_transform_iterator（zip（counting_it，sw.begin（）），base_n< Integer，base，array_size，array_count，window_size>（））; 
 
 thrust :: inclusive_scan_by_key（ti_begin，ti_begin + total_count，base_4_ti，result.begin（））; 
 PRINTER（result）; 
 
 thrust :: device_vector< Integer> result_2（total_count）; 
 
 auto ti_pre_sort = thrust :: make_transform_iterator（counting_it，pre_sort< Integer，array_size，array_count，window_size>（））; 
 
 thrust :: gather（ti_pre_sort，
 ti_pre_sort + total_count，
 result.begin（），
 result_2.begin（））; 
 
 PRINTER（result_2）; 
 
 
 thrust :: device_vector< Integer> sort_keys_1（total_count）; 
 thrust :: device_vector< Integer> sort_keys_2（total_count）; 
 
 auto zip_begin = zip（sort_keys_1.begin（），sort_keys_2.begin（））; 
 
 thrust :: transform（counting_it，
 counting_it + total_count，
 zip_begin，
 generate_sort_keys< Integer，array_size，array_count，window_size>（））; 
 
 thrust :: stable_sort_by_key（result_2.begin（），result_2.end（），zip_begin）; 
 thrust :: stable_sort_by_key（zip_begin，zip_begin + total_count，result_2.begin（））; 
 
 PRINTER（result_2）; 
 
 thrust :: device_vector< Integer> key_counts（total_count）; 
 thrust :: device_vector< Integer> sort_keys_1_reduced（total_count）; 
 thrust :: device_vector< Integer> sort_keys_2_reduced（total_count）; 
 
 //计算每个子数组的出现频率
 auto zip_count_begin = zip（sort_keys_1.begin（），sort_keys_2.begin（），result_2.begin（））; 
 auto new_end = thrust :: reduce_by_key（zip_count_begin，
 zip_count_begin + total_count，
 thrust :: constant_iterator< Integer>（1），
 zip（sort_keys_1_reduced.begin（），sort_keys_2_reduced .begin（），thrust :: make_discard_iterator（）），
 key_counts.begin（）
）; 
 
 std :: size_t new_size = new_end.second  -  key_counts.begin（）; 
 key_counts.resize（new_size）; 
 sort_keys_1_reduced.resize（new_size）; 
 sort_keys_2_reduced.resize（new_size）; 
 PRINTER（key_counts）; 
 PRINTER（sort_keys_1_reduced）; 
 PRINTER（sort_keys_2_reduced）; 
 
 
 auto log_ti = thrust :: make_transform_iterator（key_counts.begin（），calculate_log（））; 
 
 thrust :: device_vector< Real> log_result（new_size）; 
 auto zip_keys_reduced_begin = zip（sort_keys_1_reduced.begin（），sort_keys_2_reduced.begin（））; 
 auto log_end = thrust :: reduce_by_key（zip_keys_reduced_begin，
 zip_keys_reduced_begin + new_size，
 log_ti，
 zip（sort_keys_1.begin（），thrust :: make_discard_iterator（）），
 log_result.begin（）
）; 
 std :: size_t final_size = log_end.second  -  log_result.begin（）; 
 log_result.resize（final_size）; 
 sort_keys_1.resize（final_size）; 
 PRINTER（log_result）; 
 
 thrust :: device_vector< Real> final_result（final_size）; 
 auto final_end = thrust :: reduce_by_key（sort_keys_1.begin（），
 sort_keys_1.begin（）+ final_size，
 log_result.begin（），
 thrust :: make_discard_iterator ，
 final_result.begin（），
 thrust :: equal_to< Integer>（），
 thrust :: minimum< Real>（）
）; 
 
 final_result.resize（final_end.second-final_result.begin（））; 
 
 PRINTER（final_result）; 
} 
 
 auto duration = std :: chrono :: duration_cast< std :: chrono :: milliseconds>（std :: chrono :: high_resolution_clock :: now（） -  start）; 
 std :: cout<< take<< duration.count（）/ runs<< milliseconds<< std :: endl; 
 
 return 0; 
}

使用编译

  nvcc -std = c ++ 11 conditional_entropy.cu -o benchmark -DARRAY_SIZE = 1000 -DARRAY_COUNT = 1000 -DWINDOW_SIZE = 10& ./benchmark

这个配置在我的GPU（GTX 680）上需要133毫秒，所以大约0.1毫秒数组。

实现可以被优化，例如使用预先计算的查找表用于基本4转换，并且可以避免一些推力调用。

I'm trying to figure out if it's possible to efficiently calculate the conditional entropy of a set of numbers using CUDA. You can calculate the conditional entropy by dividing an array into windows, then counting the number of matching subarrays/substrings for different lengths. For each subarray length, you calculate the entropy by adding together the matching subarray counts times the log of those counts. Then, whatever you get as the minimum entropy is the conditional entropy.

To give a more clear example of what I mean, here is full calculation:

The initial array is [1,2,3,5,1,2,5]. Assuming the window size is 3, this must be divided into five windows: [1,2,3], [2,3,5], [3,5,1], [5,1,2], and [1,2,5].
Next, looking at each window, we want to find the matching subarrays for each length.
The subarrays of length 1 are [1],[2],[3],[5],[1]. There are two 1s, and one of each other number. So the entropy is log(2)2 + 4(log(1)*1) = 0.6.
The subarrays of length 2 are [1,2], [2,3], [3,5], [5,1], and [1,2]. There are two [1,2]s, and four unique subarrays. The entropy is the same as length 1, log(2)2 + 4(log(1)*1) = 0.6.
The subarrays of length 3 are the full windows: [1,2,3], [2,3,5], [3,5,1], [5,1,2], and [1,2,5]. All five windows are unique, so the entropy is 5*(log(1)*1) = 0.
The minimum entropy is 0, meaning it is the conditional entropy for this array.

This can also be presented as a tree, where the counts at each node represent how many matches exist. The entropy for each subarray length is equivalent to the entropy for each level of the tree.

If possible, I'd like to perform this calculation on many arrays at once, and also perform the calculation itself in parallel. Does anyone have suggestions on how to accomplish this? Could thrust be useful? Please let me know if there is any additional information I should provide.

解决方案

I tried solving your problem using thrust. It works, but it results in a lot of thrust calls. Since your input size is rather small, you should process multiple arrays in parallel. However, doing this results in a lot of book-keeping effort, you will see this in the following code.

Your input range is limited to [1,5], which is equivalent to [0,4]. The general idea is that (theoretically) any tuple out of this range (e.g. {1,2,3} can be represented as a number in base 4 (e.g. 1+2*4+3*16 = 57). In practice we are limited by the size of the integer type. For a 32bit unsigned integer this will lead to a maximum tuple size of 16. This is also the maximum window size the following code can handle (changing to a 64bit unsigned integer will lead to a maximum tuple size of 32).

Let's assume the input data is structured like this: We have 2 arrays we want to process in parallel, each array is of size 5 and window size is 3.

{{0,0,3,4,4},{0,2,1,1,3}}

We can now generate all windows:

{{0,0,3},{0,3,4},{3,4,4}},{{0,2,1},{2,1,1},{1,1,3}}

Using a per tuple prefix sum and applying the aforementioned representation of each tuple as a single base-4 number, we get:

{{0,0,48},{0,12,76},{3,19,83}},{{0,8,24},{2,6,22},{1,5,53}}

Now we reorder the values so we have the numbers which represent a subarray of a specific length next to each other:

{{0,0,3},{0,12,19},{48,76,83}},{0,2,1},{8,6,5},{24,22,53}}

We then sort within each group:

{{0,0,3},{0,12,19},{48,76,83}},{0,1,2},{5,6,8},{22,24,53}}

Now we can count how often a number occurs in each group:

2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1

Applying the log-formula results in

0.60206,0,0,0,0,0

Now we fetch the minimum value per array:

0,0

#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <iostream>
#include <thrust/tuple.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/gather.h>
#include <thrust/sort.h>
#include <math.h>

#include <chrono>

#ifdef PRINT_ENABLED
#define PRINTER(name) print(#name, (name))
#else
#define PRINTER(name)
#endif

template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}


template <typename Integer, Integer Min, Integer Max>
struct random_filler
{
    __device__
    Integer operator()(std::size_t index) const
    {
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<Integer> dist(Min, Max);
        rng.discard(index);
        return dist(rng);
    }
};

template <std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          typename T,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>

__device__ __inline__          
thrust::tuple<T,T,T,T> calc_indices(const T& i0)
{
    const T i1 = i0 / PerArrayCount;
    const T i2 = i0 % PerArrayCount;
    const T i3 = i2 / WindowSize;
    const T i4 = i2 % WindowSize;
    return thrust::make_tuple(i1,i2,i3,i4);
}

template <typename Iterator, 
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount,
          std::size_t TotalCount =  PerArrayCount * ArrayCount
          >
class sliding_window
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct window_functor : public thrust::unary_function<difference_type,difference_type>
    {
        __host__ __device__
        difference_type operator()(const difference_type& i0) const
        { 
            auto t = calc_indices<ArraySize, ArrayCount,WindowSize>(i0);

            return thrust::get<0>(t) * ArraySize + thrust::get<2>(t)  + thrust::get<3>(t);
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<window_functor, CountingIterator>   TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    typedef PermutationIterator iterator;

    sliding_window(Iterator first) : first(first){}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), window_functor()));
    }

    iterator end(void) const
    {
        return begin() + TotalCount;
    }

    protected:
    Iterator first;
};

template <std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          typename Iterator>
sliding_window<Iterator, ArraySize, ArrayCount, WindowSize>
make_sliding_window(Iterator first)
{
    return sliding_window<Iterator, ArraySize, ArrayCount, WindowSize>(first);
}


template <typename KeyType, 
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize> 
struct key_generator : thrust::unary_function<KeyType, thrust::tuple<KeyType,KeyType> >
{
    __device__
    thrust::tuple<KeyType,KeyType> operator()(std::size_t i0) const
    {        

        auto t = calc_indices<ArraySize, ArrayCount,WindowSize>(i0);
        return thrust::make_tuple(thrust::get<0>(t),thrust::get<2>(t));
    }
};


template <typename Integer,
          std::size_t Base,
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize>
struct base_n : thrust::unary_function<thrust::tuple<Integer, Integer>, Integer>
{
    __host__ __device__ 
    Integer operator()(const thrust::tuple<Integer, Integer> t) const
    {    
        const auto i = calc_indices<ArraySize, ArrayCount, WindowSize>(thrust::get<0>(t));

        // ipow could be optimized by precomputing a lookup table at compile time
        const auto result =  thrust::get<1>(t)*ipow(Base, thrust::get<3>(i));
        return result;
    }

    // taken from http://stackoverflow.com/a/101613/678093
    __host__ __device__ __inline__
    Integer ipow(Integer base, Integer exp) const
    {
        Integer result = 1;
        while (exp)
        {
            if (exp & 1)
                result *= base;
            exp >>= 1;
            base *= base;
        }

        return result;
    }
};

template <std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          typename T,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>

__device__ __inline__          
thrust::tuple<T,T,T,T> calc_sort_indices(const T& i0)
{
    const T i1 = i0 % PerArrayCount;
    const T i2 = i0 / PerArrayCount;
    const T i3 = i1 % WindowCount;
    const T i4 = i1 / WindowCount;
    return thrust::make_tuple(i1,i2,i3,i4);
}

template <typename Integer,
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>
struct pre_sort : thrust::unary_function<Integer, Integer>
{
    __device__
    Integer operator()(Integer i0) const
    {

        auto t = calc_sort_indices<ArraySize, ArrayCount,WindowSize>(i0);


        const Integer i_result = ( thrust::get<2>(t)  * WindowSize  + thrust::get<3>(t) ) + thrust::get<1>(t) * PerArrayCount;
        return i_result;
    }
};


template <typename Integer,
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>
struct generate_sort_keys : thrust::unary_function<Integer, Integer>
{
    __device__
    thrust::tuple<Integer,Integer> operator()(Integer i0) const
    {

        auto t = calc_sort_indices<ArraySize, ArrayCount,WindowSize>(i0);

        return thrust::make_tuple( thrust::get<1>(t), thrust::get<3>(t));
    }
};


template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    return thrust::make_zip_iterator(thrust::make_tuple(its...));
}


struct calculate_log : thrust::unary_function<std::size_t, float>
{
  __host__ __device__
  float operator()(std::size_t i) const
  {
    return i*log10f(i);
  }
};


int main()
{
    typedef int Integer;
    typedef float Real;

    const std::size_t array_count = ARRAY_COUNT;
    const std::size_t array_size = ARRAY_SIZE;

    const std::size_t window_size = WINDOW_SIZE;

    const std::size_t window_count = array_size - (window_size-1);

    const std::size_t input_size = array_count * array_size;

    const std::size_t base = 4;

    thrust::device_vector<Integer> input_arrays(input_size);

    thrust::counting_iterator<Integer> counting_it(0);

    thrust::transform(counting_it,
                      counting_it + input_size,
                      input_arrays.begin(),
                      random_filler<Integer,0,base>());

    PRINTER(input_arrays);

    const int runs = 100;

    auto start = std::chrono::high_resolution_clock::now();
    for (int k = 0 ; k < runs; ++k)
    {
      auto sw = make_sliding_window<array_size, array_count, window_size>(input_arrays.begin());

      const std::size_t total_count = window_size * window_count * array_count;

      thrust::device_vector<Integer> result(total_count);

      thrust::copy(sw.begin(), sw.end(), result.begin());

      PRINTER(result);


      auto ti_begin = thrust::make_transform_iterator(counting_it, key_generator<Integer, array_size, array_count, window_size>());

      auto base_4_ti = thrust::make_transform_iterator(zip(counting_it, sw.begin()), base_n<Integer, base, array_size, array_count, window_size>());

      thrust::inclusive_scan_by_key(ti_begin, ti_begin+total_count, base_4_ti, result.begin());
      PRINTER(result);

      thrust::device_vector<Integer> result_2(total_count);

      auto ti_pre_sort = thrust::make_transform_iterator(counting_it, pre_sort<Integer, array_size, array_count, window_size>());

      thrust::gather(ti_pre_sort,
                    ti_pre_sort+total_count,
                    result.begin(),
                    result_2.begin());

      PRINTER(result_2);


      thrust::device_vector<Integer> sort_keys_1(total_count);
      thrust::device_vector<Integer> sort_keys_2(total_count);

      auto zip_begin = zip(sort_keys_1.begin(),sort_keys_2.begin());

      thrust::transform(counting_it,
                        counting_it+total_count,
                        zip_begin,
                        generate_sort_keys<Integer, array_size, array_count, window_size>());

      thrust::stable_sort_by_key(result_2.begin(), result_2.end(), zip_begin);
      thrust::stable_sort_by_key(zip_begin, zip_begin+total_count, result_2.begin());

      PRINTER(result_2);

      thrust::device_vector<Integer> key_counts(total_count);
      thrust::device_vector<Integer> sort_keys_1_reduced(total_count);
      thrust::device_vector<Integer> sort_keys_2_reduced(total_count);

      // count how often each sub array occurs
      auto zip_count_begin = zip(sort_keys_1.begin(), sort_keys_2.begin(), result_2.begin());
      auto new_end = thrust::reduce_by_key(zip_count_begin,
                                          zip_count_begin + total_count,
                                          thrust::constant_iterator<Integer>(1),
                                          zip(sort_keys_1_reduced.begin(), sort_keys_2_reduced.begin(), thrust::make_discard_iterator()),
                                          key_counts.begin()
                                          );

      std::size_t new_size = new_end.second - key_counts.begin();
      key_counts.resize(new_size);
      sort_keys_1_reduced.resize(new_size);
      sort_keys_2_reduced.resize(new_size);
      PRINTER(key_counts);
      PRINTER(sort_keys_1_reduced);
      PRINTER(sort_keys_2_reduced);


      auto log_ti = thrust::make_transform_iterator (key_counts.begin(), calculate_log());

      thrust::device_vector<Real> log_result(new_size);
      auto zip_keys_reduced_begin = zip(sort_keys_1_reduced.begin(), sort_keys_2_reduced.begin());
      auto log_end = thrust::reduce_by_key(zip_keys_reduced_begin,
                                          zip_keys_reduced_begin + new_size,
                                          log_ti,
                                          zip(sort_keys_1.begin(),thrust::make_discard_iterator()),
                                          log_result.begin()
                            );
      std::size_t final_size = log_end.second - log_result.begin();
      log_result.resize(final_size);
      sort_keys_1.resize(final_size);
      PRINTER(log_result);

      thrust::device_vector<Real> final_result(final_size);
      auto final_end = thrust::reduce_by_key(sort_keys_1.begin(),
                                            sort_keys_1.begin() + final_size,
                                            log_result.begin(),
                                            thrust::make_discard_iterator(),
                                            final_result.begin(),
                                            thrust::equal_to<Integer>(),
                                            thrust::minimum<Real>()
                            );

      final_result.resize(final_end.second-final_result.begin());

      PRINTER(final_result);
    }

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start);
    std::cout << "took " << duration.count()/runs << "milliseconds" << std::endl;

    return 0;
}

compile using

nvcc -std=c++11 conditional_entropy.cu -o benchmark -DARRAY_SIZE=1000 -DARRAY_COUNT=1000 -DWINDOW_SIZE=10 && ./benchmark

This configuration takes 133 milliseconds on my GPU (GTX 680), so around 0.1 milliseconds per array.

The implementation can definitely be optimized, e.g. using a precomputed lookup table for the base-4 conversion and maybe some of the thrust calls can be avoided.

这篇关于CUDA Thrust - 计算匹配子阵列的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

CUDA Thrust - 计算匹配子阵列 [英] CUDA Thrust - Counting matching subarrays

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

CUDA Thrust - 计算匹配子阵列 [英] CUDA Thrust - Counting matching subarrays

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭