Simple Thrust code performs about half as fast as my naive cuda kernel. Am I using Thrust wrong?


Problem Description



I'm pretty new to Cuda and Thrust, but my impression was that Thrust, when used well, is supposed to offer better performance than naively written Cuda kernels. Am I using Thrust in a sub-optimal way? Below is a complete, minimal example that takes an array u of length N+2, and for each i between 1 and N computes the average 0.5*(u[i-1] + u[i+1]) and puts the result in uNew[i]. (uNew[0] is set to u[0] and uNew[N+1] is set to u[N+1], so that the boundary terms don't change.) The code performs this averaging a large number of times to get reasonable times for timing tests. On my hardware, the Thrust computation takes roughly twice as long as the naive code. Is there a way to improve my Thrust code?
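In serial form, one sweep is just the following host-side restatement of the computation described above (numtype in the code below is double):

// Host-side reference version of a single sweep, for clarity only:
// interior points get the neighbor average, boundary values carry over.
void sweep_reference(int N, const double* u, double* uNew){
	uNew[0] = u[0];
	for(int i = 1; i <= N; i++){
		uNew[i] = 0.5*(u[i-1] + u[i+1]);
	}
	uNew[N+1] = u[N+1];
}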

#include <iostream>
#include <cmath>                   // pow() used in main()
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>    // thrust::host_vector is used below
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>    // thrust::device_free is used below
#include <thrust/fill.h>
#include <boost/timer.hpp>

typedef double numtype;

template <typename T> class NeighborAverageFunctor{
	int N;
public:
	NeighborAverageFunctor(int _N){
		N = _N;
	}
	template <typename Tuple>
	__host__ __device__ void operator()(Tuple t){
		T uL = thrust::get<0>(t);
		T uR = thrust::get<1>(t);

		thrust::get<2>(t) = 0.5*(uL + uR);
	}

	int getN(){
		return N;
	}
};

template <typename T> void thrust_sweep(thrust::device_ptr<T> u, thrust::device_ptr<T> uNew, NeighborAverageFunctor<T>& op){
	int N = op.getN();
	thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(u, u + 2, uNew + 1)), thrust::make_zip_iterator(thrust::make_tuple(u + N, u + N+2, uNew + N+1)), op);
	// Propagate boundary values without changing them
	uNew[0] = u[0];
	uNew[N+1] = u[N+1];
}


template <typename T> __global__ void initialization_kernel(int n, T* u){
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if(i < n+2){
		if(i == 0){
			u[i] = 1.0;
		}
		else{
			u[i] = 0.0;
		}
	}
}

template <typename T> __global__ void sweep_kernel(int n, T /* hSquared, unused */, T* u, T* uNew){
	const int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i >= 1 && i < n-1){
		uNew[i] = 0.5*(u[i+1] + u[i-1]);
	}
	else if(i == 0 || i == n-1){
		// n is N+2 here, so the last element is n-1; the original's
		// i == n+1 was out of range and left uNew[n-1] unwritten.
		uNew[i] = u[i];
	}
}

int main(void){
	int sweeps = 2000;
	int N = 4096*2048;
	numtype h = 1.0/N;
	numtype hSquared = pow(h, 2);

	NeighborAverageFunctor<numtype> op(N);

	thrust::device_ptr<numtype> u_d = thrust::device_malloc<numtype>(N+2);
	thrust::device_ptr<numtype> uNew_d = thrust::device_malloc<numtype>(N+2);
	thrust::device_ptr<numtype> uTemp_d;

	thrust::fill(u_d, u_d + (N+2), 0.0);
	u_d[0] = 1.0;

	boost::timer timer1;	// boost/timer.hpp declares boost::timer, not boost::timer::timer

	for(int k = 0; k < sweeps; k++){
		thrust_sweep<numtype>(u_d, uNew_d, op);
		uTemp_d = u_d;
		u_d = uNew_d;
		uNew_d = uTemp_d;
	}

	double thrust_time = timer1.elapsed();

	thrust::host_vector<numtype> u_h(N+2);
	thrust::copy(u_d, u_d + N+2, u_h.begin());
	for(int i = 0; i < 10; i++){
		std::cout << u_h[i] << " ";
	}
	std::cout << std::endl;

	thrust::device_free(u_d);
	thrust::device_free(uNew_d);

	numtype * u_raw_d, * uNew_raw_d, * uTemp_raw_d;
	cudaMalloc(&u_raw_d, (N+2)*sizeof(numtype));
	cudaMalloc(&uNew_raw_d, (N+2)*sizeof(numtype));

	numtype * u_raw_h = (numtype*)malloc((N+2)*sizeof(numtype));

	int block_size = 256;
	int grid_size = ((N+2) + block_size - 1) / block_size;

	initialization_kernel<numtype><<<grid_size, block_size>>>(N, u_raw_d);

	boost::timer timer2;

	for(int k = 0; k < sweeps; k++){
		sweep_kernel<numtype><<<grid_size, block_size>>>(N+2, hSquared, u_raw_d, uNew_raw_d);
		uTemp_raw_d = u_raw_d;
		u_raw_d = uNew_raw_d;
		uNew_raw_d = uTemp_raw_d;
	}

	cudaDeviceSynchronize();	// kernel launches are asynchronous; wait for them before reading the timer
	double raw_time = timer2.elapsed();

	cudaMemcpy(u_raw_h, u_raw_d, (N+2)*sizeof(numtype), cudaMemcpyDeviceToHost);

	for(int i = 0; i < 10; i++){
		std::cout << u_raw_h[i] << " ";
	}
	std::cout << std::endl;

	std::cout << "Thrust: " << thrust_time << " s" << std::endl;
	std::cout << "Raw: " << raw_time << " s" << std::endl;

	free(u_raw_h);

	cudaFree(u_raw_d);
	cudaFree(uNew_raw_d);

	return 0;
}

Solution

According to my testing, these lines:

uNew[0] = u[0];
uNew[N+1] = u[N+1];

are killing your thrust performance relative to the kernel method. When I eliminate them, the results don't seem to be any different. Compared to how your kernel is handling the boundary cases, the thrust code is using a very expensive method (cudaMemcpy operations, under the hood) to perform the boundary handling.
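For illustration only (this is a sketch, not Thrust's actual internals): each host-side assignment through a thrust::device_ptr amounts to a pair of tiny, synchronizing transfers, roughly:

// Roughly what uNew[0] = u[0] costs when u and uNew are thrust::device_ptr<T>
// and the assignment happens on the host:
T tmp;
cudaMemcpy(&tmp, u.get(), sizeof(T), cudaMemcpyDeviceToHost);    // read u[0]
cudaMemcpy(uNew.get(), &tmp, sizeof(T), cudaMemcpyHostToDevice); // write uNew[0]

Two such 8-byte copies per sweep, each of which blocks the host, easily dominate a loop whose real work is a single kernel launch.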

Since your thrust functor never actually writes to the boundary positions, it should be sufficient to write these values only once, rather than in a loop.

You can speed up your thrust performance significantly by doing a better job of handling the boundary cases.
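A minimal sketch of that change, reusing the names from the question (the boundary values match the question's initialization; this is illustrative rather than verbatim code):

// Sweep only the interior points; no boundary writes in the per-sweep path.
template <typename T> void thrust_sweep(thrust::device_ptr<T> u, thrust::device_ptr<T> uNew, NeighborAverageFunctor<T>& op){
	int N = op.getN();
	thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(u, u + 2, uNew + 1)),
	                 thrust::make_zip_iterator(thrust::make_tuple(u + N, u + N+2, uNew + N+1)),
	                 op);
}

// ...and in main(), write the boundaries once, into BOTH buffers, so that the
// pointer swap after each sweep always leaves them valid:
u_d[0] = 1.0;    uNew_d[0] = 1.0;
u_d[N+1] = 0.0;  uNew_d[N+1] = 0.0;

for(int k = 0; k < sweeps; k++){
	thrust_sweep<numtype>(u_d, uNew_d, op);
	uTemp_d = u_d;
	u_d = uNew_d;
	uNew_d = uTemp_d;
}

With the per-sweep device_ptr assignments gone, each iteration is a single for_each launch, which is the same amount of device work as the raw kernel does.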
