How to asynchronously copy memory from the host to the device using thrust and CUDA streams


Problem Description

I would like to copy memory from the host to the device using thrust as in

thrust::host_vector<float> h_vec(1 << 28);
thrust::device_vector<float> d_vec(1 << 28);
thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

using CUDA streams analogously to how you would copy memory from the device to the device using streams:

cudaStream_t s;
cudaStreamCreate(&s);

thrust::device_vector<float> d_vec1(1 << 28), d_vec2(1 << 28);
thrust::copy(thrust::cuda::par.on(s), d_vec1.begin(), d_vec1.end(), d_vec2.begin());

cudaStreamSynchronize(s);
cudaStreamDestroy(s);

The problem is that I can't set the execution policy to CUDA to specify the stream when copying from the host to the device, because, in that case, thrust would assume that both vectors are stored on the device. Is there a way to get around this problem? I'm using the latest thrust version from github (it says 1.8 in the version.h file).
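
In other words, the call I would like to make looks roughly like the sketch below, but with the cuda execution policy thrust treats both ranges as device ranges, so the host-side source is not handled as a host-to-device copy:

cudaStream_t s;
cudaStreamCreate(&s);

thrust::host_vector<float> h_vec(1 << 28);
thrust::device_vector<float> d_vec(1 << 28);

// desired, but not directly supported: an asynchronous H2D copy tied to stream s
thrust::copy(thrust::cuda::par.on(s), h_vec.begin(), h_vec.end(), d_vec.begin());

cudaStreamSynchronize(s);
cudaStreamDestroy(s);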

Solution

As indicated in the comments, I don't think this will be possible directly with thrust::copy. However, we can use cudaMemcpyAsync in a thrust application to achieve the goal of asynchronous copies and overlap of copy with compute.

Here is a worked example:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>

// DSIZE determines duration of H2D and D2H transfers
#define DSIZE (1048576*8)
// SSIZE,LSIZE determine duration of kernel launched by thrust
#define SSIZE (1024*512)
#define LSIZE 1
// KSIZE determines size of thrust kernels (number of threads per block)
#define KSIZE 64
#define TV1 1
#define TV2 2

typedef int mytype;
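// host vectors use a pinned (page-locked) allocator so that cudaMemcpyAsync
// can perform genuinely asynchronous transfers that overlap with kernel work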
typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;

struct sum_functor
{
  mytype *dptr;
  sum_functor(mytype* _dptr) : dptr(_dptr) {};
  __host__ __device__ void operator()(mytype &data) const
    {
      mytype result = data;
      for (int j = 0; j < LSIZE; j++)
        for (int i = 0; i < SSIZE; i++)
          result += dptr[i];
      data = result;
    }
};

int main(){

  pinnedVector hi1(DSIZE);
  pinnedVector hi2(DSIZE);
  pinnedVector ho1(DSIZE);
  pinnedVector ho2(DSIZE);
  thrust::device_vector<mytype> di1(DSIZE);
  thrust::device_vector<mytype> di2(DSIZE);
  thrust::device_vector<mytype> do1(DSIZE);
  thrust::device_vector<mytype> do2(DSIZE);
  thrust::device_vector<mytype> dc1(KSIZE);
  thrust::device_vector<mytype> dc2(KSIZE);

  thrust::fill(hi1.begin(), hi1.end(),  TV1);
  thrust::fill(hi2.begin(), hi2.end(),  TV2);
  thrust::sequence(do1.begin(), do1.end());
  thrust::sequence(do2.begin(), do2.end());

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1); cudaStreamCreate(&s2);

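  // stage the inputs host-to-device asynchronously, one transfer per stream
  // (the pinned host memory above is what makes these copies truly asynchronous)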
  cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);

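  // launch the thrust kernels on the same streams; each for_each consumes the
  // di1/di2 data copied on its own stream and can overlap with the other stream's work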
  thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
  thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));

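  // copy the results back device-to-host, again asynchronously on each stream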
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);

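  // block until all work on both streams has completed before checking results on the host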
  cudaDeviceSynchronize();
  for (int i=0; i < KSIZE; i++){
    if (ho1[i] != ((LSIZE*SSIZE*TV1) + i)) { std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((LSIZE*SSIZE*TV1)+i) << std::endl; return 1;}
    if (ho2[i] != ((LSIZE*SSIZE*TV2) + i)) { std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((LSIZE*SSIZE*TV2)+i) << std::endl; return 1;}
    }
  std::cout << "Success!" << std::endl;
  return 0;
}

For my test case, I used RHEL5.5, Quadro5000, and cuda 6.5RC. This example is designed to have thrust create very small kernels (only a single threadblock, as long as KSIZE is small, say 32 or 64), so that the kernels that thrust creates from thrust::for_each are able to run concurrently.
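
As a side note, whether copies and kernels can actually overlap like this depends on the GPU; a minimal standalone sketch (not part of the test code above, querying device 0) to check the relevant device properties might look like:

#include <cstdio>
#include <cuda_runtime.h>

int main(){
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);   // query device 0
  // concurrentKernels: non-zero if the device can run multiple kernels at once
  // asyncEngineCount:  number of copy engines available for copy/compute overlap
  printf("concurrentKernels: %d, asyncEngineCount: %d\n",
         prop.concurrentKernels, prop.asyncEngineCount);
  return 0;
}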

When I profile this code, I see the following (the profiler timeline screenshot is not reproduced here):

This indicates that we are achieving proper overlap both between thrust kernels, and between copy operations and thrust kernels, as well as asynchronous data copying at the completion of the kernels. Note that the cudaDeviceSynchronize() operation "fills" the timeline, indicating that all the async operations (data copying, thrust functions) were issued asynchronously and control returned to the host thread before any of the operations were underway. All of this is expected, proper behavior for full concurrency between host, GPU, and data copying operations.
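
If you want to confirm the asynchronous issuance without a profiler, one way (a sketch, not part of the original code) is to query the streams right after the last asynchronous call returns, i.e. immediately after the second device-to-host cudaMemcpyAsync and before cudaDeviceSynchronize() in the example's main():

// cudaErrorNotReady means the stream still has queued work in flight,
// i.e. control returned to the host before the operations completed
if (cudaStreamQuery(s1) == cudaErrorNotReady || cudaStreamQuery(s2) == cudaErrorNotReady)
  std::cout << "streams still busy right after issue - calls were asynchronous" << std::endl;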
