使用CUDA的推力与数组来代替向量来inclusive_scan [英] Using cuda thrust with arrays instead vectors to inclusive_scan

查看:306
本文介绍了使用CUDA的推力与数组来代替向量来inclusive_scan的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有@ M.S给予code:

I have a code given by @m.s.:

#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>

struct omit_negative : public thrust::unary_function<int, int>
{
  __host__ __device__
  int operator()(int value)
  {
    if (value<0)
    {
      value = 0;
    }
    return value;
  }
};

int main()
{
  int array[] = {2,1,-1,3,-1,2};
  const int array_size = sizeof(array)/sizeof(array[0]);
  thrust::device_vector<int> d_array(array, array + array_size);
  thrust::device_vector<int> d_result(array_size);

  std::cout << "input data" << std::endl;
  thrust::copy(d_array.begin(), d_array.end(), std::ostream_iterator<int>(std::cout, " "));

  thrust::inclusive_scan(thrust::make_transform_iterator(d_array.begin(), omit_negative()),
                         thrust::make_transform_iterator(d_array.end(),   omit_negative()),
                         d_result.begin());

  std::cout << std::endl << "after inclusive_scan" << std::endl;
  thrust::copy(d_result.begin(), d_result.end(), std::ostream_iterator<int>(std::cout, " "));

  using namespace thrust::placeholders;
  thrust::scatter_if(d_array.begin(),
                     d_array.end(),
                     thrust::make_counting_iterator(0),
                     d_array.begin(),
                     d_result.begin(),
                     _1<0
                    );

  std::cout << std::endl << "after scatter_if" << std::endl;
  thrust::copy(d_result.begin(), d_result.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
}

它是指 previous问题

我不知道的推力,但现在我想我会放弃自己的写作code的想法。我宁愿使用的推力。我修改我的算法:而不是-1有0(所以make_transform是没有必要的)。此外,您的示例创建主机阵列。但实际上我有存储在设备prepared阵列,而且我喜欢用它(而不是矢量),以避免产生多余的内存,并避免复制存储器(它的成本时间 - 最短的时间成本是我的目标)。我不知道如何使用,而不是向量阵列。下面是我写的:

I didn't know about thrust, but now I guess I'm going to quit idea of writing own code. I'd rather use thrust. I modified my algorithm: instead -1 there are 0's (so make_transform is not necessary). Also your example creates array on host. But actually I have prepared array stored on device, and I' like to use it (instead of vectors) to avoid creating redundant memory and to avoid copying memory (it costs time - minimal time cost is my goal). I'm not sure how to use arrays instead of vectors. Here is what I've written:

int* dev_l_set = 0; 
cudaMalloc((void**)&dev_l_set, actualVerticesRowCount * sizeof(int)); 

...prepare array in kernel... 

thrust::device_vector<int> d_result(actualVerticesRowCount); 

thrust::inclusive_scan(dev_l_set, dev_l_set + actualVerticesRowCount, dev_l_set); 

using namespace thrust::placeholders; 
thrust::scatter_if(dev_l_set, dev_l_set + actualVerticesRowCount, thrust::make_counting_iterator(0), dev_l_set, d_result.begin(), _1 <= 0); 
cudaFree(dev_l_set); 

dev_l_set = thrust::raw_pointer_cast(d_result.data());

我无法从device_vector强制转换为int *,但我想存储在初始 dev_l_set 阵列扫描结果。此外,它会是巨大的,做到这一点的地方,就是有必要使用 d_result 在scatter_if?

I can't cast from device_vector to int*, but I'd like to store result of scanning in initial dev_l_set array. Also it'd be great to do it in place, is it necessary to use d_result in scatter_if?

实际输入(存储在为int * - 设备端):
(例子)

Actual Input (stored on int* - device side): (example)

dev_l_set[0] = 0
dev_l_set[1] = 2
dev_l_set[2] = 0
dev_l_set[3] = 3
dev_l_set[4] = 0
dev_l_set[5] = 1

所需的输出上面输入:

Desired output to the above input:

dev_l_set[0] = 0
dev_l_set[1] = 2
dev_l_set[2] = 0
dev_l_set[3] = 5
dev_l_set[4] = 0
dev_l_set[5] = 6

dev_l_set 应存储输入,然后做扫描的地方,到底应该存储输出。

dev_l_set should store input, then do scan in place and in the end it should store output.

这可能是这样的。

int* dev_l_set = 0; 
cudaMalloc((void**)&dev_l_set, actualVerticesRowCount * sizeof(int)); 

...prepare array in kernel... (see input data) 

thrust::inclusive_scan(dev_l_set, dev_l_set + actualVerticesRowCount, dev_l_set); 

using namespace thrust::placeholders; 
thrust::scatter_if(dev_l_set, dev_l_set + actualVerticesRowCount, thrust::make_counting_iterator(0), dev_l_set, dev_l_set, _1 <= 0); 

我的Cuda的版本(即最少应的应用程​​序的工作)是5.5(特斯拉M2070),不幸的是我不能使用C ++ 11。

My Cuda version (minimal that app should work) is 5.5 (Tesla M2070) and unfortunately I can't use c++11.

推荐答案

可以做到包容性扫描,以及在发生散射一步无需额外的结果向量。

You can do the inclusive scan as well as the scatter step in place without an additional result vector.

下面的例子直接使用从原始设备指针数据,而不推力:: device_vector 。包容性扫描后,previously 0 元素被恢复。

The following example directly uses the data from a raw device pointer without thrust::device_vector. After the inclusive scan, the previously 0 elements are restored.

由于@JaredHoberock指出,人们不应该依赖于居住在 code推力::细节。所以我编辑我的答案,并从的 推力::详细:: head_flags 直接进入这个例子。

As @JaredHoberock pointed out, one should not rely on code residing in thrust::detail. I therefore edited my answer and copied part of the code from thrust::detail::head_flags directly into this example.

#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>

#include <iostream>


// the following code is copied from <thrust/detail/range/head_flags.h>
#include <thrust/detail/config.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
#include <thrust/functional.h>


template<typename RandomAccessIterator,
     typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
     typename ValueType = bool,
     typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
  class head_flags
{

  public:
    struct head_flag_functor
    {
      BinaryPredicate binary_pred; // this must be the first member for performance reasons
      IndexType n;

      typedef ValueType result_type;

      __host__ __device__
      head_flag_functor(IndexType n)
    : binary_pred(), n(n)
      {}

      __host__ __device__
      head_flag_functor(IndexType n, BinaryPredicate binary_pred)
    : binary_pred(binary_pred), n(n)
      {}

      template<typename Tuple>
      __host__ __device__ __thrust_forceinline__
      result_type operator()(const Tuple &t)
      {
    const IndexType i = thrust::get<0>(t);

    // note that we do not dereference the tuple's 2nd element when i <= 0
    // and therefore do not dereference a bad location at the boundary
    return (i == 0 || !binary_pred(thrust::get<1>(t), thrust::get<2>(t)));
      }
    };

    typedef thrust::counting_iterator<IndexType> counting_iterator;

  public:
    typedef thrust::transform_iterator<
      head_flag_functor,
      thrust::zip_iterator<thrust::tuple<counting_iterator,RandomAccessIterator,RandomAccessIterator> >
    > iterator;

    __host__ __device__
    head_flags(RandomAccessIterator first, RandomAccessIterator last)
      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
                                            head_flag_functor(last - first))),
    m_end(m_begin + (last - first))
    {}

    __host__ __device__
    head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
                                            head_flag_functor(last - first, binary_pred))),
    m_end(m_begin + (last - first))
    {}

    __host__ __device__
    iterator begin() const
    {
      return m_begin;
    }

    __host__ __device__
    iterator end() const
    {
      return m_end;
    }

    template<typename OtherIndex>
    __host__ __device__
    typename iterator::reference operator[](OtherIndex i)
    {
      return *(begin() + i);
    }

  private:
    iterator m_begin, m_end;
};

template<typename RandomAccessIterator>
__host__ __device__
head_flags<RandomAccessIterator>
  make_head_flags(RandomAccessIterator first, RandomAccessIterator last)
{
  return head_flags<RandomAccessIterator>(first, last);
}


int main()
{
    // copy data to device, this will be produced by your kernel
    int array[] = {0,2,0,3,0,1};
    const int array_size = sizeof(array)/sizeof(array[0]);
    int* dev_l_set;
    cudaMalloc((void**)&dev_l_set, array_size * sizeof(int));
    cudaMemcpy(dev_l_set, array, array_size * sizeof(int), cudaMemcpyHostToDevice);

    // wrap raw pointer in a thrust::device_ptr so thrust knows that this memory is located on the GPU
    thrust::device_ptr<int> dev_ptr = thrust::device_pointer_cast(dev_l_set);
    thrust::inclusive_scan(dev_ptr,
                 dev_ptr+array_size,
                 dev_ptr);

    // copy result back to host for printing
    cudaMemcpy(array, dev_l_set, array_size * sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "after inclusive_scan" << std::endl;
    thrust::copy(array, array+array_size, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;

    using namespace thrust::placeholders;
    thrust::scatter_if(thrust::make_constant_iterator(0),
             thrust::make_constant_iterator(0)+array_size,
             thrust::make_counting_iterator(0),
             make_head_flags(dev_ptr, dev_ptr+array_size).begin(),
             dev_ptr,
             !_1
            );

    // copy result back to host for printing
    cudaMemcpy(array, dev_l_set, array_size * sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "after scatter_if" << std::endl;
    thrust::copy(array, array+array_size, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
}

输出

after inclusive_scan
0 2 2 5 5 6 
after scatter_if
0 2 0 5 0 6 

这篇关于使用CUDA的推力与数组来代替向量来inclusive_scan的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆