thrust :: max_element慢比较cublasIsamax - 更高效的实现? [英] thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
问题描述
我需要一个快速有效的实现来找到CUDA中数组中的最大值的索引。此操作需要执行几次。我最初使用cublasIsamax为此,但是,它可悲的是返回最大绝对值的索引,这不是我想要的。相反,我使用thrust :: max_element,但是速度比cublasIsamax相当慢。我以下列方式使用它:
// d_vector是设备上指向向量开头的指针,包含nrElements浮动。
thrust :: device_ptr< float> d_ptr = thrust :: device_pointer_cast(d_vector);
thrust :: device_vector< float> :: iterator d_it = thrust :: max_element(d_ptr,d_ptr + nrElements);
max_index = d_it - (thrust :: device_vector< float> :: iterator)d_ptr;
向量中元素的数量范围介于10'000到20'000之间。推力:: max_element和cublasIsamax之间的速度差异相当大。
更有效的实现方法是编写自己的最大索引减少代码在CUDA。
我们可以比较3种方法:
-
thrust :: max_element
-
cublasIsamax
- 自订CUDA内核
完全工作示例:
$ cat t665.cu
#include< cublas_v2.h>
#include< thrust / extrema.h>
#include< thrust / device_ptr.h>
#include< thrust / device_vector.h>
#include< iostream>
#include< stdlib.h>
#define DSIZE 10000
// nTPB应该是2的幂次
#define nTPB 256
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE / nTPB)+1)
#define MIN(a,b)((a> b)?b:a)
#define FLOAT_MIN -1.0f
#include< time.h>
#include< sys / time.h>
unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
timeval tv1;
gettimeofday(& tv1,0);
return((tv1.tv_sec * USECPSEC)+ tv1.tv_usec) - prev;
}
__device__ volatile float blk_vals [MAX_BLOCKS];
__device__ volatile int blk_idxs [MAX_BLOCKS];
__device__ int blk_num = 0;
template< typename T>
__global__ void max_idx_kernel(const T * data,const int dsize,int * result){
__shared__ volatile T vals [nTPB];
__shared__ volatile int idxs [nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x + blockDim.x * blockIdx.x;
last_block = 0;
T my_val = FLOAT_MIN;
int my_idx = -1;
//从全局存储器中扫描
while(idx< dsize){
if(data [idx]> my_val){my_val = data [idx] my_idx = idx;}
idx + = blockDim.x * gridDim.x;}
//填充共享内存
vals [threadIdx.x] = my_val;
idxs [threadIdx.x] = my_idx;
__syncthreads();
//清除共享内存
for(int i =(nTPB> 1); i> 0; i>> = 1){
if(threadIdx.x& ; i)
if(vals [threadIdx.x]< vals [threadIdx.x + i]){vals [threadIdx.x] = vals [threadIdx.x + i]; idxs [threadIdx.x] = idxs [threadIdx.x + i]; }
__syncthreads();}
//执行块级别减少
if(!threadIdx.x){
blk_vals [blockIdx.x] = vals [0];
blk_idxs [blockIdx.x] = idxs [0];
if(atomicAdd(& blk_num,1)== gridDim.x - 1)//然后我是最后一个块
last_block = 1;}
__syncthreads
if(last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while(idx< gridDim.x){
if(blk_vals [idx]> my_val){my_val = blk_vals [idx] my_idx = blk_idxs [idx]; }
idx + = blockDim.x;}
//填充共享内存
vals [threadIdx.x] = my_val;
idxs [threadIdx.x] = my_idx;
__syncthreads();
//清除共享内存
for(int i =(nTPB> 1); i> 0; i>> = 1){
if(threadIdx.x& ; i)
if(vals [threadIdx.x]< vals [threadIdx.x + i]){vals [threadIdx.x] = vals [threadIdx.x + i]; idxs [threadIdx.x] = idxs [threadIdx.x + i]; }
__syncthreads();}
if(!threadIdx.x)
* result = idxs [0];
}
}
int main(){
int nrElements = DSIZE;
float * d_vector,* h_vector;
h_vector = new float [DSIZE];
for(int i = 0; i h_vector [10] = 10; // create definite max element
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(& my_handle);
cudaMalloc(& d_vector,DSIZE * sizeof(float));
cudaMemcpy(d_vector,h_vector,DSIZE * sizeof(float),cudaMemcpyHostToDevice);
int max_index = 0;
unsigned long long dtime = dtime_usec(0);
// d_vector是设备上指向向量开头的指针,包含nrElements个浮点数。
thrust :: device_ptr< float> d_ptr = thrust :: device_pointer_cast(d_vector);
thrust :: device_vector< float> :: iterator d_it = thrust :: max_element(d_ptr,d_ptr + nrElements);
max_index = d_it - (thrust :: device_vector< float> :: iterator)d_ptr;
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std :: cout<< 推力时间:< dtime /(float)USECPSEC < max index:<< max_index<< std :: endl;
max_index = 0;
dtime = dtime_usec(0);
my_status = cublasIsamax(my_handle,DSIZE,d_vector,1,& max_index);
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std :: cout<< cublas time:< dtime /(float)USECPSEC < max index:< max_index<< std :: endl;
max_index = 0;
int * d_max_index;
cudaMalloc(& d_max_index,sizeof(int));
dtime = dtime_usec(0);
max_idx_kernel
cudaMemcpy(& max_index,d_max_index,sizeof(int),cudaMemcpyDeviceToHost);
dtime = dtime_usec(dtime);
std :: cout<< 内核时间:< dtime /(float)USECPSEC < max index:<< max_index<< std :: endl;
return 0;
}
$ nvcc -O3 -arch = sm_20 -o t665 t665.cu -lcublas
$ ./t665
推力时间:0.00075最大值索引:10
cublas时间:6.3e-05最大索引:11
内核时间:2.5e-05最大索引:10
$
注意:
- CUBLAS返回的索引值高于其他值,因为 CUBLAS使用基于1的索引。
- CUBLAS 可能会更快,但如果您使用
CUBLAS_POINTER_MODE_DEVICE
- CUBLAS与
CUBLAS_POINTER_MODE_DEVICE
应该是异步的,因此cudaDeviceSynchronize()
将是基于主机的计时,我在这里展示的。在某些情况下,推力也可以是异步的。 - 为了CUBLAS和其他方法之间的方便和结果比较,我使用所有非负值作为我的数据。如果您使用负值,您可能需要调整
FLOAT_MIN
值。 - 如果您对性能可以尝试调整
nTPB
和MAX_KERNEL_BLOCKS
参数,看看是否可以最大限度地提高特定GPU的性能。内核代码还可以通过在(两个)线程块减少的最后阶段不仔细地切换到扭曲同步模式而在表上留下一些性能。 - 线程块减少内核使用块消耗/最后块策略来避免额外内核启动的开销,以执行最终减少。
I need a fast and efficient implementation for finding the index of the maximum value in an array in CUDA. This operation needs to be performed several times. I originally used cublasIsamax for this, however, it sadly returns the index of the maximum absolute value, which is not what I want. Instead, I'm using thrust::max_element, however the speed is rather slow in comparison to cublasIsamax. I use it in the following manner:
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
The number of elements in the vector range between 10'000 and 20'000. The difference in speed between thrust::max_element and cublasIsamax is rather big. Perhaps I'm performing several memory transactions without knowing?
A more efficient implementation would be to write your own max-index reduction code in CUDA. It's likely that cublasIsamax
is using something like this under the hood.
We can compare 3 approaches:
thrust::max_element
cublasIsamax
- custom CUDA kernel
Here's a fully worked example:
$ cat t665.cu
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <stdlib.h>
#define DSIZE 10000
// nTPB should be a power-of-2
#define nTPB 256
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <time.h>
#include <sys/time.h>
unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
timeval tv1;
gettimeofday(&tv1,0);
return ((tv1.tv_sec * USECPSEC)+tv1.tv_usec) - prev;
}
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
template <typename T>
__global__ void max_idx_kernel(const T *data, const int dsize, int *result){
__shared__ volatile T vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
T my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
h_vector = new float[DSIZE];
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10] = 10; // create definite max element
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
int max_index = 0;
unsigned long long dtime = dtime_usec(0);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std::cout << "thrust time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;
max_index = 0;
dtime = dtime_usec(0);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std::cout << "cublas time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;
max_index = 0;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
dtime = dtime_usec(0);
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
dtime = dtime_usec(dtime);
std::cout << "kernel time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;
return 0;
}
$ nvcc -O3 -arch=sm_20 -o t665 t665.cu -lcublas
$ ./t665
thrust time: 0.00075 max index: 10
cublas time: 6.3e-05 max index: 11
kernel time: 2.5e-05 max index: 10
$
Notes:
- CUBLAS returns an index 1 higher than the others because CUBLAS uses 1-based indexing.
- CUBLAS might be quicker if you used
CUBLAS_POINTER_MODE_DEVICE
, however for validation you would still have to copy the result back to the host. - CUBLAS with
CUBLAS_POINTER_MODE_DEVICE
should be asynchronous, so thecudaDeviceSynchronize()
will be desirable for the host based timing I've shown here. In some cases, thrust can be asynchronous as well. - For convenience and results comparison between CUBLAS and the other methods, I am using all nonnegative values for my data. You may want to adjust the
FLOAT_MIN
value if you are using negative values as well. - If you're freaky about performance, you can try tuning the
nTPB
andMAX_KERNEL_BLOCKS
parameters to see if you can max out performance on your specific GPU. The kernel code also arguably leaves some performance on the table by not switching carefully into a warp-synchronous mode for the final stages of the (two) threadblock reduction(s). - The threadblock reduction kernel uses a block-draining/last-block strategy to avoid the overhead of an additional kernel launch to perform the final reduction.
这篇关于thrust :: max_element慢比较cublasIsamax - 更高效的实现?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!